Load Packages

In [1]:
# Packages for data loading and manipulation
import sqlite3
import pandas as pd
import numpy as np
import scipy
import re

# Packages for display
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.dates as mdates
from matplotlib.ticker import AutoMinorLocator
import mglearn as mg
from tqdm import tqdm
from IPython.display import display, HTML, Image
import gmplot
from pandas_ml import ConfusionMatrix

# Plot graphviz
from sklearn.externals.six import StringIO
from sklearn.tree import export_graphviz
import pydotplus

# Packeages for time series
from time import time
from datetime import datetime, timedelta
from bdateutil import isbday
import holidays

# Data manipulation tools
import more_itertools as mit

# Math
import random
from scipy.stats.stats import pearsonr  

# System
from wurlitzer import sys_pipes # This is used to read the Jupyter console output.
import warnings

import socket # Check if there is internet connection
In [2]:
# Machine learning models
import sklearn

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, ShuffleSplit, TimeSeriesSplit, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, learning_curve

from sklearn.preprocessing import MinMaxScaler, minmax_scale
from sklearn.decomposition import PCA

# Metrics
from sklearn.metrics import fbeta_score, accuracy_score, mean_squared_log_error, median_absolute_error, make_scorer
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report

# Unsupervised learning 
from sklearn.cluster import KMeans

# Regression algorithms
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn.isotonic import IsotonicRegression # This regressor doesn't have a default parameter setting.
# from sklearn.linear_model import ARDRegression # This regressor takes too much time to model.
from sklearn.linear_model import HuberRegressor, Lasso, LassoCV, LinearRegression, PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor, Ridge, SGDRegressor, TheilSenRegressor 
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, NuSVR, SVR
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from xgboost import XGBRegressor

# Classification algorithms
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
# from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier
from sklearn.naive_bayes import GaussianNB # BernoulliNB and MultinomialNB are suitable only for discrete features
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
# from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from xgboost import XGBClassifier

import xgboost as xgb
In [3]:
# One can ignore the warning as this is a known issue for tensorflow 1.4.1, but it doesn't affect the usage.
# https://github.com/tensorflow/tensorflow/issues/14182
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import tensorflow as tf
    from tensorflow.python.client import device_lib
In [4]:
# Deep learning packages
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
# from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint
import keras.backend as K
from keras import initializers as K_init
from keras.optimizers import Adam
Using TensorFlow backend.
In [5]:
# Set a global seed for randomization
random_state = 16
np.random.seed(random_state)
In [6]:
import sys
print(sys.version)
3.6.4 |Anaconda custom (64-bit)| (default, Dec 21 2017, 21:42:08) 
[GCC 7.2.0]
In [7]:
# Print version of important Python packages
important_packages = [np, scipy, pd, matplotlib, sklearn, xgb, tf, keras]

for package in important_packages:
    print('{}: {}'.format(package.__name__, package.__version__))
numpy: 1.13.3
scipy: 1.0.0
pandas: 0.21.1
matplotlib: 2.1.1
sklearn: 0.19.1
xgboost: 0.6
tensorflow: 1.4.1
keras: 2.1.2
In [8]:
notebook_start_time = time()

Data Import

In [9]:
# First, import all the datasets.
# The status data cannot be directly imported due to its size. Below I will read the data through the database.
# The 'station.csv' needs to be read before one can process the sqlite in the below functions.
# Zip codes should be loaded as strings.
station = pd.read_csv('station.csv')
trip = pd.read_csv('trip.csv', dtype={'zip_code':'str'})
weather = pd.read_csv('weather.csv')
In [420]:
# If sql_status is true, the status data will be obtained from the sqlite file (which takes about 10 min) and saved to a new csv file named "status_change.csv".
# If false, the status data will be obtained from the preprocessed "status_change.csv" file.
sql_status = True
In [11]:
# Define a convenient funtion to use SQL query.
def sql(cursor, command):    
    cursor.execute(command)
    result = cursor.fetchall()
    return result     
In [12]:
def bike_status(station_id, sql_conn):
    '''The bike_status function iterates over stations and records only the changes of "bikes_available" and "docks_available" status.
    A Python 3.6 feature is used in this function. The f-string.'''
    
    status = pd.read_sql_query(f"SELECT * FROM status WHERE station_id = {station_id} ORDER BY time", con=sql_conn)
    
    # Register the change of status once the change is detected.
    # The exploration of status data later suggests that the datetime in status data is generally 1 minute later than that in the trip data. Both are recorded roughly every minute.
    status_compact = status[(status.bikes_available.diff(-1) * status.docks_available.diff(-1)) != 0]
    return status_compact
In [13]:
if sql_status == True:
    # Connect to the database in sqlite format.
    conn = sqlite3.connect('database.sqlite')
    cursor = conn.cursor()  
    
    # Explore the tables in the database.
    find_tables = "SELECT name FROM sqlite_master;"
    tables = sql(cursor, find_tables)
    print(tables)
[('station',), ('status',), ('trip',), ('weather',)]
In [14]:
if sql_status == True:
    # Investigate the "status" columns in the database.
    # Python 3.6 f-string used.
    print(f"Table {tables[1][0]} contains the following columns:")
    display(sql(cursor, f"PRAGMA table_info({tables[1][0]});")) # tables[1][0] is 'status'.
Table status contains the following columns:
[(0, 'station_id', 'INTEGER', 0, None, 0),
 (1, 'bikes_available', 'INTEGER', 0, None, 0),
 (2, 'docks_available', 'INTEGER', 0, None, 0),
 (3, 'time', 'TEXT', 0, None, 0)]
In [15]:
if sql_status == True:
    # It is nearly impossible to directly import the status.csv to a pandas dataframe. It takes too much time and space.
    # In fact, it is not necessary to take all the data as plenty of them contain little information. 
    # The bike_status function will "compress" the original big data.
    # It takes about 11 minutes to process the data.
    # The 'station.csv' needs to be read before running the following code.
    
    status_list = [bike_status(station_id, conn) for station_id in tqdm(station.id)]
    
    # Concatenate all the station status to form a dataframe "status".
    status = pd.concat(status_list, ignore_index=True)
            
    # Double check that the concatenation doesn't remove rows.
    sum_status = 0
    for bike_station in status_list:
        sum_status += len(bike_station)
    
    print(sum_status)
    print(status.shape)
       
    # Write a .csv data for future import to save time. 
    # The size of data shrinks from the original 2.0GB to less than 60MB without losing important information.
    status.to_csv("status_change.csv", index=False)
    conn.close()
else:
    status = pd.read_csv("status_change.csv")
100%|██████████| 70/70 [11:15<00:00,  9.64s/it]
2093372
(2093372, 4)
In [16]:
# Set the pandas option to display all the weather columns.
pd.set_option('display.max_columns', 50)

print(f"\nA sample of status data {status.shape}:"); display(status.head(3))
print(f"\nA sample of station data {station.shape}:"); display(station.head(3))
print(f"\nA sample of trip data {trip.shape}:"); display(trip.head(3))
print(f"\nA sample of weather data {weather.shape}:"); display(weather.head(3))
A sample of status data (2093372, 4):
station_id bikes_available docks_available time
0 2 2 25 2013/08/29 13:10:02
1 2 3 24 2013/08/29 13:52:01
2 2 2 25 2013/08/29 17:17:01
A sample of station data (70, 7):
id name lat long dock_count city installation_date
0 2 San Jose Diridon Caltrain Station 37.329732 -121.901782 27 San Jose 8/6/2013
1 3 San Jose Civic Center 37.330698 -121.888979 15 San Jose 8/5/2013
2 4 Santa Clara at Almaden 37.333988 -121.894902 11 San Jose 8/6/2013
A sample of trip data (669959, 11):
id duration start_date start_station_name start_station_id end_date end_station_name end_station_id bike_id subscription_type zip_code
0 4576 63 8/29/2013 14:13 South Van Ness at Market 66 8/29/2013 14:14 South Van Ness at Market 66 520 Subscriber 94127
1 4607 70 8/29/2013 14:42 San Jose City Hall 10 8/29/2013 14:43 San Jose City Hall 10 661 Subscriber 95138
2 4130 71 8/29/2013 10:16 Mountain View City Hall 27 8/29/2013 10:17 Mountain View City Hall 27 48 Subscriber 97214
A sample of weather data (3665, 24):
date max_temperature_f mean_temperature_f min_temperature_f max_dew_point_f mean_dew_point_f min_dew_point_f max_humidity mean_humidity min_humidity max_sea_level_pressure_inches mean_sea_level_pressure_inches min_sea_level_pressure_inches max_visibility_miles mean_visibility_miles min_visibility_miles max_wind_Speed_mph mean_wind_speed_mph max_gust_speed_mph precipitation_inches cloud_cover events wind_dir_degrees zip_code
0 8/29/2013 74.0 68.0 61.0 61.0 58.0 56.0 93.0 75.0 57.0 30.07 30.02 29.97 10.0 10.0 10.0 23.0 11.0 28.0 0 4.0 NaN 286.0 94107
1 8/30/2013 78.0 69.0 60.0 61.0 58.0 56.0 90.0 70.0 50.0 30.05 30.00 29.93 10.0 10.0 7.0 29.0 13.0 35.0 0 2.0 NaN 291.0 94107
2 8/31/2013 71.0 64.0 57.0 57.0 56.0 54.0 93.0 75.0 57.0 30.00 29.96 29.92 10.0 10.0 10.0 26.0 15.0 31.0 0 4.0 NaN 284.0 94107

Data Exploration

1. Exploration on 'station' data.

In [17]:
# First check whether there are nan values in station data.
station.isna().sum()
Out[17]:
id                   0
name                 0
lat                  0
long                 0
dock_count           0
city                 0
installation_date    0
dtype: int64

1.1 Mark stations on Google Map.

In [18]:
# Create a location format that is compatible with that for data in the below html file.
locations = np.column_stack((station.name, station.lat, station.long, station.id)).tolist()
print(locations[0])
['San Jose Diridon Caltrain Station', 37.329732, -121.901782, 2]
In [19]:
# Define a function here:
# Codes borrowed from:
# https://gist.github.com/parth1020/4481893
# https://github.com/vgm64/gmplot/blob/master/gmplot/gmplot.py
# Fremont location (37.548270, -121.988572)

para1 = '''
<html>
<head>  
    <title>Google Maps Multiple Markers</title>
    <script src="http://maps.google.com/maps/api/js?sensor=false&key=AIzaSyCsdvHMtyKFqb98ybFMw_q5QSipYEoZU7Y" type="text/javascript"></script>
</head>
<body>
    <div id="map" style="height: 600px; width: 1200px;">
    </div>
    <script type="text/javascript">
'''
para2 = f'var locations = {locations};'

para3 = '''
    var map = new google.maps.Map(document.getElementById('map'), {
      zoom: 10,
      center: new google.maps.LatLng(37.548270, -121.988572),
      mapTypeId: google.maps.MapTypeId.ROADMAP
    });

    var infowindow = new google.maps.InfoWindow();

    var marker, i;

    for (i = 0; i < locations.length; i++) { 
      marker = new google.maps.Marker({
        position: new google.maps.LatLng(locations[i][1], locations[i][2]),
        map: map
      });

      google.maps.event.addListener(marker, 'click', (function(marker, i) {
        return function() {
          infowindow.setContent(locations[i][0]);
          infowindow.open(map, marker);
        }
      })(marker, i));
    }
    </script>
</body>
</html>'''
In [20]:
# File name for writing the html code.
bike_map = "Bike_Map.html"
In [21]:
# Open the file to write the code and then exit it.
f = open(bike_map, 'w')
f.write(para1 + para2 + para3)
f.close()
In [22]:
# Define a function to check if there is internet connection.
# Code borrowed from:
# https://stackoverflow.com/questions/20913411/test-if-an-internet-connection-is-present-in-python

REMOTE_SERVER = "www.google.com"
def is_connected():
  try:
    # see if we can resolve the host name -- tells us if there is a DNS listening.
    host = socket.gethostbyname(REMOTE_SERVER)
    # connect to the host -- tells us if the host is actually reachable.
    s = socket.create_connection((host, 80), 2)
    return True
  except:
     pass
  return False
In [421]:
if is_connected():
    # Plot the markers on the interactive map with the station names, if there is internet
    display(HTML(f'<iframe src={bike_map} height="630px" width="100%"></iframe>'))
else:
    # Plot the pre-saved picture if there is no internet connection.
    display(Image(filename='Station_Map.png'))

1.2 Create a heat map to view the distribution of docks, thus the hot bike-sharing spots.

In [24]:
# https://manojsaha.com/2017/03/08/drawing-locations-google-maps-python/
# https://pypi.python.org/pypi/gmplot/1.1.1
# Initialize two empty lists to hold the latitude and longitude values
# Obtain coordinates of the stations multiplied by the number of dock counts on each station to see the heatmap of the available bikes.
latitudes = []
longitudes = []
for i in station.index:
    for dock in range(station.dock_count[i]):
        latitudes.append(station.lat[i])
        longitudes.append(station.long[i])
In [422]:
if is_connected():
    # Locate the center of the map to Fremont.
    # Sometimes the geocoding doesn't work very well. Just re-run the cell to clear the error message.
    # center = gmplot.GoogleMapPlotter.geocode("Fremont")
    
    center = (37.5482697, -121.9885719) # This is the Fremont coordinator
    gmap0 = gmplot.GoogleMapPlotter(center[0], center[1], zoom=10.5, apikey='AIzaSyCsdvHMtyKFqb98ybFMw_q5QSipYEoZU7Y')
    
    gmap0.heatmap(latitudes, longitudes)
    gmap0.draw("docks_heatmap.html")
    
    # Show the interactive hot map, if there is internet
    # As expected, most of the bikes are located in San Francisco and San Jose city centers as designed.
    display(HTML('<iframe src="docks_heatmap.html" height="630px" width="150%"></iframe>'))
else:
    # Plot the pre-saved picture if there is no internet connection.
    display(Image(filename='Docks_HotMap.png'))

As expected, San Francisco and San Jose are the two hottest spot, while Redwood city is also a hot spot. A reasonable clustering of stations will be explored below.

1.3 Station installation dates.

In [26]:
station.groupby('installation_date').agg({'dock_count': ['count', 'sum']})
Out[26]:
dock_count
count sum
installation_date
1/22/2014 1 15
12/31/2013 3 41
2/20/2014 1 15
4/9/2014 1 15
8/12/2013 4 60
8/14/2013 4 64
8/15/2013 5 97
8/16/2013 3 45
8/19/2013 6 94
8/20/2013 6 118
8/21/2013 5 87
8/22/2013 5 95
8/23/2013 7 149
8/25/2013 5 107
8/5/2013 5 83
8/6/2013 5 87
8/7/2013 4 64

Most of the stations are built around the same time in August 2013. However, there are still a few built later. In populous areas, if many people are using the sharebike service, the increase of stations wil have quite some effect on the trip counts in that area. This information will be used later in the regression problem.

2. Exploration on 'trip' data.

2.1 Find independent station groups with least inter-group trips.

In [27]:
# Use clustering techniques to plot locations.
b_loc = station[['lat', 'long']] # b_loc: bike locations

# Use KMeans to find the clusters with the number of 3 and 5.
fig, axes = plt.subplots(1, 2, figsize=(10,5))

# using three cluster centers:
kmeans_3 = KMeans(n_clusters=3, random_state=random_state).fit(b_loc)
mg.discrete_scatter(b_loc['lat'], b_loc['long'], kmeans_3.labels_, ax=axes[0])

# using five cluster centers:
kmeans_5 = KMeans(n_clusters=5, random_state=random_state).fit(b_loc)
mg.discrete_scatter(b_loc['lat'], b_loc['long'], kmeans_5.labels_, ax=axes[1])

plt.legend()
plt.show()
In [28]:
# Calculate the percentage of trips between stations. Most of the trips are between stations.
same_station = (trip.start_station_id != trip.end_station_id).nonzero()[0]
print("The percentage of trips between stations is {:.3%}.".format(len(same_station)/len(trip)))
The percentage of trips between stations is 96.421%.
In [29]:
# Define a function to quantify the reasonability of the KMeans clustering.
# Trips between clusters should be minimal.
def cluster2cluster(kmeans_labels, n_cluster, color='g'):
    # Group stations with the same cluster label.
    id_groups = [station.loc[kmeans_labels == i, 'id'].values for i in range(0, n_cluster)]
    
    s2s = trip.loc[:, ['start_station_id', 'end_station_id']]    
    # Create two columns to store the group ids for start and end stations.
    s2s['start_group'] = None
    s2s['end_group'] = None
    
    # Create new columns of group ids for the start station and the end station.
    group_id = 0
    for group in id_groups:
        s2s.loc[s2s.start_station_id.isin(group), 'start_group'] = group_id
        s2s.loc[s2s.end_station_id.isin(group), 'end_group'] = group_id
        group_id += 1
    
    # g2g = s2s[s2s.start_group != s2s.end_group][['start_group', 'end_group']] # One cluster to another cluster.
    g2g = s2s.loc[s2s.start_group != s2s.end_group, ['start_group', 'end_group']] # One cluster to another cluster.
    g2g_size = g2g.groupby(['start_group', 'end_group']).size() # The number of trips between groups.
    print("The percentage of trips between station clusters is {:.3%}.".format(len(g2g)/len(trip)))
    
    # Draw a group of bar charts.
    fig, axes = plt.subplots(1, n_cluster, figsize=(20, 5))    
    for i in range(n_cluster):
        x_ind = np.arange(g2g_size[i].size)
        axes[i].bar(x_ind, g2g_size[i], color=color)
        axes[i].set_xticks(x_ind)
        axes[i].set_xticklabels(g2g_size[i].index)
        axes[i].set_xlabel('Start station group %s' % i)
    
    plt.show()
    # The index of the group-to-group is returned. Those data will be deleted.
    return g2g.index
In [30]:
# Investigate group-to-group trips with 3 groups.
cluster3_index = cluster2cluster(kmeans_3.labels_, n_cluster=3, color='b')
The percentage of trips between station clusters is 0.011%.

The number of trips between the three bigger clusters are negligible compared to the total number of trips. The trips between San Jose to San Francisco are even fewer. This makes a lot of sense as few people can ride bikes for such a long trip.

In [31]:
# Investigate group-to-group trips with 5 groups.
cluster5_index = cluster2cluster(kmeans_5.labels_, n_cluster=5, color='g')
The percentage of trips between station clusters is 0.156%.
In [32]:
# Verify the group is assigned correctly. 
station.loc[:, 'group'] = kmeans_5.labels_
station[['group','name']].head(3)
Out[32]:
group name
0 1 San Jose Diridon Caltrain Station
1 1 San Jose Civic Center
2 1 Santa Clara at Almaden
In [33]:
# There are five unique zip codes from the weather data.
weather.zip_code.unique()
Out[33]:
array([94107, 94063, 94301, 94041, 95113])
In [34]:
# Arrange region zip codes from north to south
regions = [94107, 94063, 94301, 94041, 95113]
print(regions)
[94107, 94063, 94301, 94041, 95113]

The five zip codes correspond to: 94107: San Francisco; 94063: Redwood City; 94301: Palo Alto; 94041: Mountain View; 95113: San Jose. The weather condition has influence on people's decision whether to ride a bike or not, so it is very important. The five areas defined in the weather data match the station groups when the stations are clustered into five. It makes great sense to cluster the stations by five groups instead of three to better use the weather data and facilitate the machine learning technique.

In [35]:
# Group the station data by the group id and then sort the averaged longitude values in ascending order, which suggest from north to south.
north_to_south_groups = station.groupby('group')['long'].mean().sort_values()
north_to_south_idx = north_to_south_groups.index
print(north_to_south_groups)
group
0   -122.401517
4   -122.225626
2   -122.153588
3   -122.089002
1   -121.891606
Name: long, dtype: float64
In [36]:
# Create a mapping from the group ids to zip codes.
region_dict = dict(zip(north_to_south_idx, regions))
print(region_dict)
{0: 94107, 4: 94063, 2: 94301, 3: 94041, 1: 95113}
In [37]:
# Associate the zip codes with the cities.
region_names = dict(zip(regions, ['San Francisco', 'Redwood City', 'Palo Alto', 'Mountain View', 'San Jose']))
print(region_names)
{94107: 'San Francisco', 94063: 'Redwood City', 94301: 'Palo Alto', 94041: 'Mountain View', 95113: 'San Jose'}
In [38]:
# Create a new column of regions for trip data.
id_groups = [station['id'][kmeans_5.labels_ == i].values for i in range(5)]

trip['station_region'] = None
region_stations_dict = {} # Initiate a dictionary from the region code to stations.
group_id = 0
for group in id_groups:
    region_code = region_dict[group_id]
    region_idx = trip.start_station_id.isin(group)
    
    region_stations_dict[region_code] = group
    trip.loc[region_idx, 'station_region'] = region_code
    group_id += 1
In [39]:
# Investigate the percentage of inter-group trips among the groups. The group is assigned to the start station.
inter_group_trips = trip.station_region.loc[cluster5_index].value_counts()
region_trips = trip.station_region.value_counts()
inter_group_ratio = inter_group_trips / region_trips * 100
In [40]:
for region in region_trips.index:
    print('The percentage of trips from {} to a different region is {:.2f}% out of total {} trips.'.format(region_names[region], inter_group_ratio[region], region_trips[region]))
The percentage of trips from San Francisco to a different region is 0.00% out of total 603708 trips.
The percentage of trips from San Jose to a different region is 0.06% out of total 37878 trips.
The percentage of trips from Mountain View to a different region is 2.32% out of total 18167 trips.
The percentage of trips from Palo Alto to a different region is 7.09% out of total 6773 trips.
The percentage of trips from Redwood City to a different region is 3.03% out of total 3433 trips.
In [41]:
# Investigate how many inter-group trips are made by a subscriber or a non-subscriber
trip.subscription_type.loc[cluster5_index].value_counts()
Out[41]:
Customer      532
Subscriber    510
Name: subscription_type, dtype: int64

The inter-group trips are not negligible for some regions, especially for the three stations that are geographically close to each other. So the inter-group trips will be deleted. A new column will not be created to indicate whether a trip is inter-group or not. Although there are about the same number of subscribers and non-subscribers for inter-group trips, labeling an inter-group trip can still be useful to find out different patterns for different subscription types when other features are taken into account.

In [42]:
# Create a new column to label a trip as an inter-group trip.
trip['inter_group'] = 0
trip.loc[cluster5_index, 'inter_group'] = 1
In [43]:
# Create a new column in station to cluster the stations into groups
station['region'] = None
for region, group in region_stations_dict.items():
    station.loc[station.id.isin(group), 'region'] = region

2.2 Hourly trip counts for different subscription types.

In [44]:
# trip_date = trip['start_date'].str.split(' |/|:')
# Separate the date and time. Only the start date is important. Anyway, the number of trips over 24 hours is rare.

trip_datetime = trip['start_date'].apply(lambda x: x.split())
trip_date = trip_datetime.apply(lambda x: datetime.strptime(x[0], "%m/%d/%Y").date())
trip_time = trip_datetime.apply(lambda x: x[1])
trip_hour = trip_time.apply(lambda x: int(x.split(':')[0]))
trip['hour'] = trip_hour
In [45]:
# To expedite the process, the judgement of whether a day is a business day will be applied on unique days and then mapped to all rows in trip data.
trip_date_unique = pd.Series(trip_date.unique())
trip_BDay_unique = trip_date_unique.apply(lambda x: isbday(x, holidays=holidays.US()))

trip_date_dict = dict(zip(trip_date_unique, trip_BDay_unique))
trip_BDay = trip_date.apply(lambda x: trip_date_dict[x])
In [46]:
# Create a new column in trip data for business day judgement.
trip['date'] = trip_date
trip['BDay'] = trip_BDay
print('There are {} business days in the record.'.format(len(trip_date[trip_BDay==True].unique())))
print('There are totally {} days in the record.'.format(len(trip_date.unique())))
There are 503 business days in the record.
There are totally 733 days in the record.
In [47]:
# Define a function to visualize the number of trips per hour in order to find a reasonable way to divide the day.
def trip_hour_freq(trip_hour, trip_date, trip_day_idx, data_idx, ax, xlabel):
    # Slice the trip_hour, count each value, sort the count series by index and then plot it.
    trip_counts = trip_hour[trip_day_idx & data_idx].value_counts().sort_index() # Calculate the number of trips for each hour
    freq_day = trip_counts/len(trip_date[trip_day_idx].unique()) # Calculate the frequency per day
    ax.bar(freq_day.index, freq_day) # Plot the trip hours.
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Trip Frequency Per Day')
    ax.set_xticks(freq_day.index) # Display all the xticks, which are 24 hours.
In [48]:
# Counts for subscribers and non-subscribers
trip['subscription_type'].value_counts()
Out[48]:
Subscriber    566746
Customer      103213
Name: subscription_type, dtype: int64
In [49]:
# Show bar charts for user behaviors on during business days and rest days.
fig, axes = plt.subplots(3, 2, figsize=(20,10))

trip_hour_freq(trip_hour, trip_date, trip_BDay==True, trip.subscription_type=='Subscriber', axes[0][0], 'Subscriber on Business Day')
trip_hour_freq(trip_hour, trip_date, trip_BDay==False, trip.subscription_type=='Subscriber', axes[0][1], 'Subscriber on non-Business Day')

trip_hour_freq(trip_hour, trip_date, trip_BDay==True, trip.subscription_type=='Customer', axes[1][0], 'Customer on Business Day')
trip_hour_freq(trip_hour, trip_date, trip_BDay==False, trip.subscription_type=='Customer', axes[1][1], 'Customer on non-Business Day')

trip_hour_freq(trip_hour, trip_date, trip_BDay==True, True, axes[2][0], 'All on Business Day')
trip_hour_freq(trip_hour, trip_date, trip_BDay==False, True, axes[2][1], 'All on non-Business Day')

plt.show()

In fact, there are not too many trips per day. The dominant usage is from subscribers for business days, i.e. for work. For work, there are two main peaks, one at 8 am and the other at 5 pm. The division point will be set to (8 + 17)/2 = 12.5. Subscribers tend to use the sharebike for commuting to work place, while non-subscribers tend to use that for lunches, etc. Interestingly, the behavior of non-subscribers during weekends and holidays is almost a perfect Gaussian shape peaking between 1-2 pm.

2.3 Trip counts for different weekdays.

In [50]:
# Create a new column to record weekdays
trip['weekday'] = pd.to_datetime(trip['date']).dt.weekday
In [51]:
# Plot the trip counts for each weekday. 0 is Monday and 6 is Sunday.
trip.groupby('weekday')['id'].count().plot('bar')
plt.show()

2.4 Monthly trip counts for different description types to reveal the seasonal patterns.

In [52]:
# Create a new column for month.
trip['month'] = trip_datetime.apply(lambda x: int(x[0].split('/')[0]))
In [53]:
# Group the trip first by subscription type and then by month.
trip_monthly = trip.groupby(['subscription_type', 'month'])['id'].count()
In [54]:
# View the number of trips on each month.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
axes[0].bar(trip_monthly['Subscriber'].index, trip_monthly['Subscriber'])
axes[0].set_xlabel('Month (Subscriber)')
axes[1].bar(trip_monthly['Customer'].index, trip_monthly['Customer'])
axes[1].set_xlabel('Month (Customer)')
plt.show()

Interestingly, there is no significant variation on the number of monthly trips for subscribers except the holiday season (December); for non-subscribers we observe a clear trend of increase from the beginning of the year peaking at September and then decrease: this suggests that summer is generally a good time to bike in Bay Area for fun and September is the best season for tourists to visit Bay Area?

2.5 Hourly trip counts for each station group.

In [55]:
# Group the trips first by region and then by hour.
trip_groupby_station = trip.groupby(['station_region', 'hour'])['id'].count()
In [56]:
# Set the hour out as a feature column. Keep the station region as the index.
trip_station_region = trip_groupby_station.reset_index(level=1)
In [57]:
# The station groups in the trip data match those in the weather data.
print(trip_groupby_station.index.levels[0])
print(regions)
Int64Index([94041, 94063, 94107, 94301, 95113], dtype='int64', name='station_region')
[94107, 94063, 94301, 94041, 95113]
In [58]:
trip.groupby(['station_region'])['id'].count()
Out[58]:
station_region
94041     18167
94063      3433
94107    603708
94301      6773
95113     37878
Name: id, dtype: int64
In [59]:
# Plot the trips per hour for each city.
fig, axes = plt.subplots(1, 5, figsize=(20,5))
i = 0
for region in regions:
    region_counts = trip_station_region.loc[region]
    axes[i].bar(region_counts.hour, region_counts.id, color='b')
    axes[i].set_xlabel(region_names[region])
    i += 1
plt.show()

As we can see from the above figure, the number of trips in San Francisco is way larger than the other regions. Even the city ranking the second only has a fraction number of trips of it. This is going to skew the data analysis for trips in cities other than San Francisco, so it is necessary to divide the trips into different groups and treat them separately.

2.6 Bike durations exploration.

In [60]:
# Get the mean trip duration for each region in minute.
bike_durations = trip.groupby('station_region')['duration'].mean()/60
bike_durations_median = trip.groupby('station_region')['duration'].median()/60
trip_region_counts = trip.groupby('station_region')['id'].count()
In [61]:
for region, duration in bike_durations.iteritems():
    print('There are {0} trips in {1}, lasting {2:.1f} minutes in average.'.format(trip_region_counts[region], region_names[region], duration))
There are 18167 trips in Mountain View, lasting 29.9 minutes in average.
There are 3433 trips in Redwood City, lasting 40.7 minutes in average.
There are 603708 trips in San Francisco, lasting 17.1 minutes in average.
There are 6773 trips in Palo Alto, lasting 71.3 minutes in average.
There are 37878 trips in San Jose, lasting 23.0 minutes in average.
In [62]:
for region, duration in bike_durations_median.iteritems():
    print('There are {0} trips in {1}, lasting {2:.1f} minutes in average.'.format(trip_region_counts[region], region_names[region], duration))
There are 18167 trips in Mountain View, lasting 5.0 minutes in average.
There are 3433 trips in Redwood City, lasting 6.3 minutes in average.
There are 603708 trips in San Francisco, lasting 8.8 minutes in average.
There are 6773 trips in Palo Alto, lasting 15.1 minutes in average.
There are 37878 trips in San Jose, lasting 7.7 minutes in average.
In [63]:
bike_durations.index = bike_durations.index.to_series().map(region_names)
bike_durations_median.index = bike_durations.index
In [64]:
plt.figure(figsize=(10, 5))

plt.subplot(121)
bike_durations.plot.bar()
plt.ylabel('Average Trip Duration (min)')

plt.subplot(122)
bike_durations_median.plot.bar()
plt.ylabel('Median Trip Duration (min)')
plt.show()
In [65]:
trip_duration = trip.duration.sort_values()
In [66]:
div = 10 # Set the number of pieces to divide the trip_duration
trip_duration_idx = np.linspace(0, len(trip_duration), num=div, endpoint=False, dtype='int')
division_points = trip_duration.iloc[trip_duration_idx[1:]]/60
print("All trips can be divided into {} pieces with minute division points at:".format(div))
print(['%.1f' % elem for elem in division_points])
All trips can be divided into 10 pieces with minute division points at:
['4.0', '5.2', '6.3', '7.4', '8.6', '9.9', '11.6', '13.9', '19.2']

As we can see above, most of the trips are pretty short. More than 90% of the trips are shorter than 20 minutes.

In [67]:
trip_duration_subscriber = trip_duration[trip.subscription_type=='Subscriber'].mean()/60
trip_duration_customer = trip_duration[trip.subscription_type=='Customer'].mean()/60

print('The average trip duration for a subscriber is %.1f minutes.' % trip_duration_subscriber)
print('The average trip duration for a non-subscriber is %.1f minutes.' % trip_duration_customer)
The average trip duration for a subscriber is 9.8 minutes.
The average trip duration for a non-subscriber is 65.9 minutes.

A non-subscriber tends to have a much longer trip than a subscriber. This is reasonable as most subscribers probably use sharebikes for work commuting while non-subscribers use it for leisure purposes.

2.7 Bike usage exploration.

In [68]:
# Calculate the bike usage based on the bike id.
bike_usage = trip.bike_id.value_counts()
In [69]:
# It would be interesting to figure out where the most used bikes come from and the potential lifetime of the bike
plt.figure(figsize=(15,6))
plt.plot(bike_usage.index, bike_usage, '.')
plt.ylabel("Number of Trips")
plt.xlabel("Bike ID")
plt.title("Bike Usage View")
plt.show()
In [70]:
# Get the bike ids 
most_used_bikes = bike_usage[bike_usage > 1500].index
least_used_bikes = bike_usage[bike_usage < 500].index
In [71]:
# Get the bike trips
most_used_bikes_trips = trip[trip.bike_id.isin(most_used_bikes)]
least_used_bikes_trips = trip[trip.bike_id.isin(least_used_bikes)]
In [72]:
# Group by the station regions
most_used_bikes_group = most_used_bikes_trips.groupby('station_region')['bike_id'].count()
least_used_bikes_group = least_used_bikes_trips.groupby('station_region')['bike_id'].count()
In [73]:
# Change the index from zip code to city name
most_used_bikes_group.index = most_used_bikes_group.index.to_series().map(region_names)
least_used_bikes_group.index = least_used_bikes_group.index.to_series().map(region_names)
In [74]:
plt.figure(figsize=(15, 5))

plt.subplot(121)
most_used_bikes_group.plot.bar(logy=True)
plt.ylabel('Logarithm of Trip Counts')
plt.title('Trips with Most Used Bikes')

plt.subplot(122)
least_used_bikes_group.plot.bar()
plt.ylabel('Trip Counts')
plt.title('Trip Counts with Least Used Bikes')
plt.show()
In [75]:
station.head()
Out[75]:
id name lat long dock_count city installation_date group region
0 2 San Jose Diridon Caltrain Station 37.329732 -121.901782 27 San Jose 8/6/2013 1 95113
1 3 San Jose Civic Center 37.330698 -121.888979 15 San Jose 8/5/2013 1 95113
2 4 Santa Clara at Almaden 37.333988 -121.894902 11 San Jose 8/6/2013 1 95113
3 5 Adobe on Almaden 37.331415 -121.893200 19 San Jose 8/5/2013 1 95113
4 6 San Pedro Square 37.336721 -121.894074 15 San Jose 8/7/2013 1 95113
In [76]:
# Calculate the number of docks in each region
region_dock_counts = station.groupby('region')['dock_count'].sum()
print(region_dock_counts)
region
94041    117
94063    115
94107    665
94301     75
95113    264
Name: dock_count, dtype: int64
In [77]:
region_dock_counts.index = region_dock_counts.index.to_series().map(region_names)
In [78]:
least_used_bikes_group / region_dock_counts
Out[78]:
station_region
Mountain View    150.025641
Redwood City      28.817391
San Francisco      9.442105
Palo Alto         87.600000
San Jose         137.818182
dtype: float64

2.8 Local zip code investigation.

In [79]:
# Fill NaN with an empty string.
trip.zip_code.fillna('', inplace=True)
In [80]:
# Display bay area with 3-digit zip codes.
display(Image(filename='Zipcode_Map.png'))
In [81]:
# Sectional Center Facility: 
# Obtain zip codes (first three digits) for bay area from online source: http://maps.huge.info/zip3.htm
zip3 = ['948', '945', '947', '946', '941', '944', '940', '943', '950']
In [82]:
# Create a new column with True and False values indicating whether the rider is from Bay Area or not (roughly).
trip['local_zip'] = trip.zip_code.apply(lambda x: x[0:3] in zip3 and len(x)>=5)
In [83]:
# Calculate the ratio of the local zip number and the total zip number
local_percentage = trip.groupby('subscription_type')['local_zip'].sum() / trip.subscription_type.value_counts() * 100
print('{:.2f}% of the subscribers are local people.'.format(local_percentage[1]))
print('{:.2f}% of the non-subscribers are local people.'.format(local_percentage[0]))
89.10% of the subscribers are local people.
29.28% of the non-subscribers are local people.

This is as expected. Most subscribers are local residents, while most of the non-subscribers are tourists.

2.8 Trip trend over time

In [84]:
# Obtain the daily trips groupby the business day column
daily_trips = trip.groupby('date').agg({'BDay': ['median', 'count']})
In [85]:
# Separate the business and non-business day trips
BDay_trips = daily_trips[daily_trips['BDay']['median'] == True].reset_index()
nonBDay_trips = daily_trips[daily_trips['BDay']['median'] == False].reset_index()
In [86]:
# Plot the trip counts per day 
fig, ax = plt.subplots(figsize=(20, 8))
plt.plot(BDay_trips.date, BDay_trips['BDay']['count'], 'bo', label='Business Day')
plt.plot(nonBDay_trips.date, nonBDay_trips['BDay']['count'], 'rx', label='Non-business Day')

# Set the x axis so that every month will be displayed
months = mdates.MonthLocator()
year_month_Fmt = mdates.DateFormatter('%y/%m')
ax.xaxis.set_major_locator(months)
ax.xaxis.set_major_formatter(year_month_Fmt)

plt.legend()
plt.ylabel('Trip Counts')
ax.grid(True)


plt.show()
In [87]:
# Find out the outliers of the non-business day trips. Four are found on the above graph.
nonBDay_outliers = nonBDay_trips.loc[nonBDay_trips['BDay'].sort_values('count').iloc[-4:].index]
print(nonBDay_outliers)
           date   BDay      
                median count
24   2013-11-11  False   986
15   2013-10-14  False  1050
138  2014-11-11  False  1213
129  2014-10-13  False  1261
In [88]:
nonBDay_outliers.date
Out[88]:
24     2013-11-11
15     2013-10-14
138    2014-11-11
129    2014-10-13
Name: date, dtype: object
In [89]:
# https://pypi.python.org/pypi/holidays
# Apparently some companies don't have Veterans Day and Columbus Day off, and this seems to be a common situation.
us_holidays = holidays.US()
[us_holidays.get_list(hol) for hol in nonBDay_outliers.date.tolist()]
Out[89]:
[['Veterans Day'], ['Columbus Day'], ['Veterans Day'], ['Columbus Day']]
In [90]:
# Exclude these two holidays from the non-business days.
trip.loc[trip.date.isin(nonBDay_outliers.date), 'BDay'] = True
In [91]:
# Data before 2013/10 should be treated differently
before_Oct2013 = np.count_nonzero(trip.date < datetime(2013, 10, 1).date())
print("Percentage of trips happening before 2013/10 is {:.2%}".format(before_Oct2013/len(trip)))
Percentage of trips happening before 2013/10 is 4.08%
In [92]:
# Clearly the patterns before and after october are different. In fact, I can just delete the data, but it is also fine to just label it.
trip['before_Oct2013'] = trip.date < datetime(2013, 10, 1).date()

It is also noticeable that the business day trips are much less during December. December is considered the holiday season in the US. This means that the month can be an important feature.

2.9 Check if there is nan value in trip data

In [93]:
# There are no nan values in the trip data.
trip.isna().sum()
Out[93]:
id                    0
duration              0
start_date            0
start_station_name    0
start_station_id      0
end_date              0
end_station_name      0
end_station_id        0
bike_id               0
subscription_type     0
zip_code              0
station_region        0
inter_group           0
hour                  0
date                  0
BDay                  0
weekday               0
month                 0
local_zip             0
before_Oct2013        0
dtype: int64

3. Exploration on 'status' data.

3.1 Total docks.

In [94]:
# Create a dictionary of station to its originally designed dock counts.
# Create two more columns for the status data: the originally designed dock counts and the real number of docks.
station_docks = dict(zip(station.id, station.dock_count))
status['dock_count'] = status.station_id.apply(lambda x: station_docks[x])
status['total'] = status['bikes_available'] + status['docks_available']

3.1.1 More docks.

In [95]:
# Check situations when the sum of bikes and docks is larger than the originally designed number of docks.
more_docks = status.loc[status.total - status.dock_count > 0, :]
more_docks.groupby(['station_id', 'total']).count()
Out[95]:
bikes_available docks_available time dock_count
station_id total
22 26 237 237 237 237
27 9547 9547 9547 9547
39 20 2 2 2 2
21 186 186 186 186

As the above tables shows, only two stations have the total number of docks larger than the originally designed dock counts. Close to 10000 entries in the status data has dock count to be 27, which is pretty suspicious. Below we will explore further whether the total number of docks indeed changes or just system malfunction.

In [96]:
# Show both the head and the tail of station 22. 
# It shows that the number of docks for station 22 is actually 27 in the beginning and the end. The number of dock count for station 22 needs to be updated.
status[status.station_id == 22].iloc[np.r_[0:5, -5:0]]
Out[96]:
station_id bikes_available docks_available time dock_count total
322607 22 13 14 2013/08/29 12:21:01 25 27
322608 22 14 13 2013/08/29 12:39:01 25 27
322609 22 13 14 2013/08/29 12:45:02 25 27
322610 22 12 15 2013/08/29 12:48:01 25 27
322611 22 13 14 2013/08/29 17:49:01 25 27
332809 22 13 14 2015-08-31 17:42:01 25 27
332810 22 17 10 2015-08-31 18:17:02 25 27
332811 22 18 9 2015-08-31 18:55:03 25 27
332812 22 17 10 2015-08-31 19:08:02 25 27
332813 22 16 11 2015-08-31 23:59:02 25 27
In [97]:
# Show both the head and the tail of station 39.
# It shows that the number of docks for station 39 is 19 in the beginning and the end. No change. The larger count could be just a system error.
status[status.station_id == 39].iloc[np.r_[0:5, -5:0]]
Out[97]:
station_id bikes_available docks_available time dock_count total
917017 39 7 12 2013/08/29 12:41:03 19 19
917018 39 6 13 2013/08/29 13:17:01 19 19
917019 39 7 12 2013/08/29 13:23:01 19 19
917020 39 8 11 2013/08/29 13:44:02 19 19
917021 39 7 12 2013/08/29 13:45:01 19 19
957027 39 5 14 2015-08-31 19:39:02 19 19
957028 39 6 13 2015-08-31 19:45:02 19 19
957029 39 7 12 2015-08-31 20:06:02 19 19
957030 39 8 11 2015-08-31 20:14:02 19 19
957031 39 9 10 2015-08-31 23:59:02 19 19
In [98]:
# Update the station 22 dock count to 27
station_22_index = (station.id == 22).nonzero()[0][0]
station.loc[station_22_index, 'dock_count'] = 27
station_docks = dict(zip(station.id, station.dock_count))
status['dock_count'] = status.station_id.apply(lambda x: station_docks[x])
In [99]:
# Check situations when the sum of bikes and docks is larger than the originally designed number of docks.
more_docks = status[status.total - status.dock_count > 0]
more_docks.groupby(['station_id', 'total']).count()
Out[99]:
bikes_available docks_available time dock_count
station_id total
39 20 2 2 2 2
21 186 186 186 186

3.1.2 Less docks.

Find those stations when the total dock counts keep changing. This may have something to do with the vandalism: https://www.theguardian.com/us-news/2017/aug/21/bike-sharing-scheme-san-francisco-gentrification-vandalism

In [100]:
# Count the number of status changes when the sum of available bikes and docks is smaller than the actual dock count, which shouldn't change in the two years' operation.
less_docks = status.loc[status.total - status.dock_count < 0, :]
less_docks_counts = less_docks.groupby(['station_id'])['total'].count()
less_docks_station_id = less_docks_counts.index
In [101]:
fig, ax = plt.subplots(figsize=(15, 5))
for region, color in zip(region_stations_dict, ['b', 'r', 'k', 'g', 'y']):
    station_docks_count = less_docks_counts.loc[region_stations_dict[region]]
    ax.bar(station_docks_count.index, station_docks_count, color=color, label=region_names[region])

plt.xlabel('Station ID')
plt.ylabel('Status Changes')
plt.title('Counts of status changes on suspected dock malfunction')
plt.xticks(range(0, max(less_docks_station_id)+1, 2))
plt.legend()
plt.show()
In [102]:
# Demonstrate how to obtain the hour of the status data.
print(less_docks.time.iloc[1])
print(int(less_docks.time.iloc[1].split()[1].split(':')[0]))
2013/10/18 08:14:01
8
In [103]:
less_docks['hour'] = less_docks.time.apply(lambda x: int(x.split()[1].split(':')[0]))
/home/radiowei/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
In [104]:
less_docks_counts_hour = less_docks.groupby('hour')['time'].count()

plt.figure(figsize=(10, 5))
less_docks_counts_hour.plot.bar(color='blue')
plt.ylabel('Counts of dock counts discrepancy')
plt.show()

Although the dock malfunction may indeed happen possible due to vandalism, it is also possible that the dock status may not be well reflected when frequent change of status happends in less than one minute.

3.1.3 Total dock changes

In [105]:
status['hour'] = status.time.apply(lambda x: int(x.split()[1].split(':')[0]))
In [106]:
%%time
status['datetime'] = pd.to_datetime(status.time)
status['date'] = status.datetime.apply(lambda x: x.date())
status['weekday'] = status.datetime.dt.weekday
CPU times: user 14.2 s, sys: 284 ms, total: 14.5 s
Wall time: 14.6 s
In [107]:
# Get the total dock count change events per station
station_status_list = []
for station_id in status.station_id.unique():
    station_status = status[status.station_id == station_id]
    station_status_list.append(station_status[station_status.total.diff() != 0])
In [108]:
# Concatenate the list to form the full dataframe
dock_changes = pd.concat(station_status_list)
print(dock_changes.shape)
print('The total dock count change events occurs in {:.2f}% of the total status changes.'.format(len(dock_changes) / len(status) * 100))
(11872, 10)
The total dock count change events occurs in 0.57% of the total status changes.
In [109]:
plt.figure(figsize=(10, 5))
dock_changes.groupby('hour')['total'].count().plot.bar(color='green')
plt.ylabel('Counts of total dock count change events')
plt.show()

3.2 Extra trips in status data

In [110]:
# Get the number of station status changes from the trip data. Start suggests the decrease of available bikes. End suggests the increase.
trip_status_start = trip.loc[:, ['start_date', 'weekday']]
trip_status_end = trip.loc[:, ['end_date', 'weekday']]

trip_status_start['start_hour'] = trip_status_start.loc[:, 'start_date'].apply(lambda x: int(x.split()[1].split(':')[0]))
trip_status_end['end_hour'] = trip_status_end.loc[:, 'end_date'].apply(lambda x: int(x.split()[1].split(':')[0]))
In [111]:
# Calculate the event counts per hour for trip data
trip_status_start_count = trip_status_start.groupby(['weekday', 'start_hour'])['start_hour'].count()
trip_status_end_count = trip_status_end.groupby(['weekday', 'end_hour'])['end_hour'].count()
In [112]:
# Calculate the change of number of the available bikes
status['bike_diff'] = status.bikes_available.diff()
In [113]:
# Calculate the bike count increase and decrease in status data
bike_incr = status.loc[status.bike_diff > 0, :]
bike_decr = status.loc[status.bike_diff < 0, :]
In [114]:
# Calculate the event counts per hour for status data
bike_incr_count = bike_incr.groupby(['weekday', 'hour'])['bike_diff'].sum()
bike_decr_count = - bike_decr.groupby(['weekday', 'hour'])['bike_diff'].sum() # All bike_diff values are negative. Need to be negated.
In [115]:
# Calculate the extra trips in status data than in trip data
bike_incr_diff = bike_incr_count - trip_status_end_count
bike_decr_diff = bike_decr_count - trip_status_start_count
In [116]:
bike_incr_diff_reset = bike_incr_diff.reset_index()
bike_decr_diff_reset = bike_decr_diff.reset_index()

bike_incr_diff_reset.columns = ['weekday', 'hour', 'counts']
bike_decr_diff_reset.columns = ['weekday', 'hour', 'counts']
In [117]:
number_of_weeks = len(trip.date.unique()) / 7
bike_incr_diff_reset.counts = bike_incr_diff_reset.counts / number_of_weeks # Calculate the averaged daily extra trips
bike_decr_diff_reset.counts = bike_decr_diff_reset.counts / number_of_weeks # Calculate the averaged daily extra trips

# Convert into a 2D table.
bike_incr_diff_pivot = bike_incr_diff_reset.pivot(index='hour', columns='weekday', values='counts')
bike_decr_diff_pivot = bike_decr_diff_reset.pivot(index='hour', columns='weekday', values='counts')
In [118]:
# Generate the contour plot.
X = bike_incr_diff_pivot.columns.values
Y = bike_incr_diff_pivot.index.values
Z_incr = bike_incr_diff_pivot.values
Z_decr = bike_decr_diff_pivot.values

Xi,Yi = np.meshgrid(X, Y)
plt.figure(figsize=(15,10))

plt.subplot(211)
plt.xlabel('Hour')
plt.ylabel('Weekday')
plt.title('Extra trips when the number of bikes increases')
plt.xticks(range(0, 24, 1))
plt.yticks(range(0, 7, 1))
plt.contourf(Yi, Xi, Z_incr, 300, cmap=plt.cm.jet)
plt.colorbar()

plt.subplot(212)
plt.xlabel('Hour')
plt.ylabel('Weekday')
plt.title('Extra trips when the number of bikes decreases')
plt.xticks(range(0, 24, 1))
plt.yticks(range(0, 7, 1))
plt.contourf(Yi, Xi, Z_decr, 300, cmap=plt.cm.jet)
plt.colorbar()

plt.show()
In [119]:
print('There are {:.1f}% more trips with bike count increase in status data.'.format((bike_incr.bike_diff.sum() - len(trip_status_end)) / len(trip_status_end) * 100))
print('There are {:.1f}% more trips with bike count decrease in status data.'.format((- bike_decr.bike_diff.sum() - len(trip_status_start)) / len(trip_status_start) * 100))
There are 73.7% more trips with bike count increase in status data.
There are 73.7% more trips with bike count decrease in status data.
In [120]:
# Clean up the status data so that only available bike changes are used. It contributes more than 99% to the data.
bike_avail_changes = status[status.bikes_available.diff() != 0]
len(bike_avail_changes) / len(status)
Out[120]:
0.9985898349648319
View only on the hour axis.
In [121]:
bike_incr_diff_hour = bike_incr_diff_reset.groupby('hour')['counts'].sum()
bike_decr_diff_hour = bike_decr_diff_reset.groupby('hour')['counts'].sum()
In [122]:
# Plot the feature importances from the daily, am, pm trip models
# Matplotlib code borrowed from: https://chrisalbon.com/python/matplotlib_grouped_bar_plot.html
X_tick_labels = bike_incr_diff_hour.index
pos = np.array(range(len(X_tick_labels)))

plt.figure(figsize=(20, 7))
ax = plt.subplot(111)
width = 0.25
ax.bar(pos-0.5*width, bike_incr_diff_hour, width=width, color='#7BC024', label='Bike Count Increase')
ax.bar(pos+0.5*width, bike_decr_diff_hour, width=width, color='#17C9BF', label='Bike Count Decrease')
ax.legend()

ax.set_xlabel('Hour')
ax.set_ylabel('Extra status changes')
ax.set_xticks(pos)
ax.set_xticklabels(X_tick_labels)

# Adding the legend and showing the plot
plt.legend(loc='upper right')
plt.grid()
plt.show()

3.3 When are stations empty?

In [123]:
%%time
# Read all the status data that have zero bikes available.
conn = sqlite3.connect('database.sqlite')
cursor = conn.cursor()
no_bikes = pd.read_sql_query(f"SELECT * FROM status WHERE bikes_available = 0 ORDER BY time", con=conn)
conn.close()
CPU times: user 6.25 s, sys: 3.16 s, total: 9.41 s
Wall time: 10.8 s
In [124]:
no_bikes.head()
Out[124]:
station_id bikes_available docks_available time
0 4 0 11 2013/08/29 12:06:01
1 7 0 15 2013/08/29 12:06:01
2 13 0 15 2013/08/29 12:06:01
3 70 0 19 2013/08/29 12:06:01
4 4 0 11 2013/08/29 12:07:01
In [125]:
# Get the business day
no_bikes['datetime'] = pd.to_datetime(no_bikes.time)
no_bikes['date'] = no_bikes.datetime.apply(lambda x: x.date())
no_bikes['hour'] = no_bikes.time.apply(lambda x: int(x.split()[1].split(':')[0])) # Get the hours in a day.

no_bikes_date_unique = pd.Series(no_bikes.date.unique())
no_bikes_BDay_unique = no_bikes_date_unique.apply(lambda x: isbday(x, holidays=holidays.US()))

no_bikes_date_dict = dict(zip(no_bikes_date_unique, no_bikes_BDay_unique))
no_bikes['BDay'] = no_bikes.date.apply(lambda x: no_bikes_date_dict[x])
In [126]:
# Assign the regions to stations.
no_bikes['station_region'] = None
no_bikes_region_dict = {} # Initiate a dictionary from the region code to stations.
station_num_dict = {}
group_id = 0

for group in id_groups:
    region_code = region_dict[group_id]
    region_idx = no_bikes.station_id.isin(group)
    
    no_bikes_region_dict[region_code] = group    
    station_num_dict[region_code] = len(group)
    
    no_bikes.loc[region_idx, 'station_region'] = region_code
    group_id += 1
In [127]:
# Divide the group into BDay and non-BDay
no_bikes_BDay = no_bikes.loc[no_bikes.BDay == True, :]
no_bikes_nonBDay = no_bikes.loc[no_bikes.BDay == False, :]
In [128]:
# Calculate the empty minutes per hour per region in a single day.
number_of_BDays = no_bikes_BDay_unique.sum() # Calculate the number of business days.
number_of_nonBDays = len(no_bikes_date_unique) - number_of_BDays

region_hour_BDay = no_bikes_BDay.groupby(['station_region', 'hour'])['bikes_available'].count() / number_of_BDays
region_hour_nonBDay = no_bikes_nonBDay.groupby(['station_region', 'hour'])['bikes_available'].count() / number_of_nonBDays
In [129]:
# Plot empty station events per station per hour.
region_count = 1
plt.figure(figsize=(12, 12))
for region in region_hour_BDay.index.levels[0]:
    plt.subplot(5, 1, region_count) 
    
    # Show the minutes count per station   
    (region_hour_BDay[region] / station_num_dict[region]).plot.bar(label=region_names[region]) 
    
    plt.ylabel('Empty Station (min)')
    plt.xlabel('')
    plt.legend()
    region_count += 1
In [130]:
# Plot empty station events per station per hour.
region_count = 1
plt.figure(figsize=(12, 12))
for region in region_hour_nonBDay.index.levels[0]:
    plt.subplot(5, 1, region_count) 
    
    # Show the per station minutes count    
    (region_hour_nonBDay[region] / station_num_dict[region]).plot.bar(label=region_names[region]) 
    
    plt.ylabel('Empty Station (min)')
    plt.xlabel('')
    plt.legend()
    region_count += 1

3.4 When are stations full?

In [131]:
%%time
# Read all the status data that have zero docks available.
conn = sqlite3.connect('database.sqlite')
cursor = conn.cursor()
station_full = pd.read_sql_query(f"SELECT * FROM status WHERE docks_available = 0 ORDER BY time", con=conn)
conn.close()
CPU times: user 6.62 s, sys: 3.2 s, total: 9.83 s
Wall time: 11.6 s
In [132]:
# The ratio of station full over empty
len(station_full) / len(no_bikes)
Out[132]:
0.6175359732629621
In [133]:
station_full['datetime'] = pd.to_datetime(station_full.time)
station_full['date'] = station_full.datetime.apply(lambda x: x.date())
station_full['hour'] = station_full.time.apply(lambda x: int(x.split()[1].split(':')[0])) # Get the hours in a day.
In [134]:
# Assign the regions to stations.
station_full['station_region'] = None
station_full_region_dict = {} # Initiate a dictionary from the region code to stations.
station_num_dict = {}
group_id = 0

for group in id_groups:
    region_code = region_dict[group_id]
    region_idx = station_full.station_id.isin(group)
    
    station_full_region_dict[region_code] = group    
    station_num_dict[region_code] = len(group)
    
    station_full.loc[region_idx, 'station_region'] = region_code
    group_id += 1
In [135]:
# Calculate the full minutes per hour per region in a single day.
number_of_days = len(station_full.date.unique())

region_hour_full = station_full.groupby(['station_region', 'hour'])['bikes_available'].count() / number_of_days
In [136]:
# Plot full station events per station per hour.
region_count = 1
plt.figure(figsize=(12, 12))
for region in region_hour_full.index.levels[0]:
    plt.subplot(5, 1, region_count) 
    
    # Show the minutes count per station   
    (region_hour_full[region] / station_num_dict[region]).plot.bar(label=region_names[region]) 
    
    plt.ylabel('Full Station (min)')
    plt.xlabel('')
    plt.legend()
    region_count += 1

3.5 Check whether status has NaN values

In [137]:
# Status data don't have nan values.
status.isna().sum()
Out[137]:
station_id         0
bikes_available    0
docks_available    0
time               0
dock_count         0
total              0
hour               0
datetime           0
date               0
weekday            0
bike_diff          1
dtype: int64

4. Exploration on 'weather' data.

In [138]:
weather.head(1)
Out[138]:
date max_temperature_f mean_temperature_f min_temperature_f max_dew_point_f mean_dew_point_f min_dew_point_f max_humidity mean_humidity min_humidity max_sea_level_pressure_inches mean_sea_level_pressure_inches min_sea_level_pressure_inches max_visibility_miles mean_visibility_miles min_visibility_miles max_wind_Speed_mph mean_wind_speed_mph max_gust_speed_mph precipitation_inches cloud_cover events wind_dir_degrees zip_code
0 8/29/2013 74.0 68.0 61.0 61.0 58.0 56.0 93.0 75.0 57.0 30.07 30.02 29.97 10.0 10.0 10.0 23.0 11.0 28.0 0 4.0 NaN 286.0 94107
In [139]:
weather.shape
Out[139]:
(3665, 24)

4.1 Explore the precipitation column

In [140]:
# Precipitation column is not numeric!!
precip = weather.precipitation_inches
precip.dtype
Out[140]:
dtype('O')
In [141]:
# Show unique values in precipitation data
precip.unique()
Out[141]:
array(['0', '0.23', 'T', '0.01', '0.28', '0.63', '0.29', '0.06', '0.85',
       '0.09', '0.64', '0.42', '0.35', '0.43', '0.22', '0.74', '0.03',
       '0.12', '0.16', '0.49', '0.17', '0.08', '0.04', '0.53', '0.07',
       '0.02', '0.83', '1.06', '1.71', '0.37', '0.27', '0.45', '0.78',
       '0.88', '0.66', '0.47', '0.1', '0.61', '0.14', '0.05', '0.68',
       '0.97', '0.26', '0.15', '0.87', '0.57', '0.69', '0.32', '0.21',
       '0.24', '0.52', '0.36', '0.33', '0.25', '0.11', '0.2', '1.18',
       '1.43', '3.12', '0.48', '0.19', '1.09', '0.65', '0.13', '0.91',
       '0.99', '0.18', '0.4', '1.07', nan, '0.41', '0.34', '1.25', '1.85',
       '3.36', '0.71', '1.3', '0.72', '0.6', '0.51', '1.2', '1.28', '3.23',
       '0.55', '1.26', '0.39'], dtype=object)
In [142]:
print('{:.2}% of the weather data are labeled "T" for precipitation.'.format(len(weather[precip=='T'])/len(weather) * 100))
print('{:.2}% of the weather data are labeled "nan" for precipitation.'.format(len(weather[precip.isna()])/len(weather) * 100))
3.9% of the weather data are labeled "T" for precipitation.
0.027% of the weather data are labeled "nan" for precipitation.

The following is quoted from http://www.nws.noaa.gov/om/csd/info/NOWdata/FAQ.php:

Looking at daily data, some dates have an "M" or a &"T" in the field. What does this mean?

"M" stands for "Missing". Data for an element will be missing if the primary sensor for that weather element is inoperable (e.g., has an outage) or malfunctioning (e.g., producing errant data) AND any collocated backup sensor is also inoperable or malfunctioning. "T" stand for "Trace". This is a small amount of precipitation that will wet a raingage but is less than the 0.01 inch measuring limit.

Thus, it is reasonable to assign a small value to 'T', 0.005 inches for example.

In [143]:
weather.loc[precip=='T', 'precipitation_inches'] = 0.005
In [144]:
# Only one entry contains NaN value. This will be kept to analyze later after the merge into the complete datasets.
weather[precip.isna()]
Out[144]:
date max_temperature_f mean_temperature_f min_temperature_f max_dew_point_f mean_dew_point_f min_dew_point_f max_humidity mean_humidity min_humidity max_sea_level_pressure_inches mean_sea_level_pressure_inches min_sea_level_pressure_inches max_visibility_miles mean_visibility_miles min_visibility_miles max_wind_Speed_mph mean_wind_speed_mph max_gust_speed_mph precipitation_inches cloud_cover events wind_dir_degrees zip_code
2911 8/8/2015 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 94301
In [145]:
weather.precipitation_inches = weather.precipitation_inches.astype('float')
print(weather.precipitation_inches.dtype)
float64

4.2 Deal with NaN values in numeric columns

In [146]:
# Summarize the number of NaNs in each column.
weather.isna().sum()
Out[146]:
date                                 0
max_temperature_f                    4
mean_temperature_f                   4
min_temperature_f                    4
max_dew_point_f                     54
mean_dew_point_f                    54
min_dew_point_f                     54
max_humidity                        54
mean_humidity                       54
min_humidity                        54
max_sea_level_pressure_inches        1
mean_sea_level_pressure_inches       1
min_sea_level_pressure_inches        1
max_visibility_miles                13
mean_visibility_miles               13
min_visibility_miles                13
max_wind_Speed_mph                   1
mean_wind_speed_mph                  1
max_gust_speed_mph                 899
precipitation_inches                 1
cloud_cover                          1
events                            3143
wind_dir_degrees                     1
zip_code                             0
dtype: int64

For weather parameters, the best guess can be the average value of the adjacent days. One can argue that the weather may abruptly change during these days, but an assumption of some mild change is still better than just deleting the whole row. However, the max_gust_speed_mph needs to be investigated independently as there are too many nan values.

In [147]:
def replace_nan(data, col):
    col_nan = data.loc[data[col].isna(), col] # Get the nan entries of the specified column
    nan_idx = col_nan.index # Get the index of the col_nan
    
    # Group adjacent indices from col_nan_idx according to the method from the following link:
    # https://stackoverflow.com/questions/2154249/identify-groups-of-continuous-numbers-in-a-list
    for group in mit.consecutive_groups(nan_idx):
        col_idx = list(group)
        
        # Set the mean of the adjacent six values as the value for the NaN
        weather.loc[col_idx, col] = weather.loc[(col_idx[0]-3):(col_idx[-1]+3), col].mean()
In [148]:
# The replace_nan function will be applied to all numeric columns except the gust column
nan_cols = ['max_temperature_f', 'mean_temperature_f', 'min_temperature_f',
       'max_dew_point_f', 'mean_dew_point_f', 'min_dew_point_f',
       'max_humidity', 'mean_humidity', 'min_humidity',
       'max_sea_level_pressure_inches', 'mean_sea_level_pressure_inches',
       'min_sea_level_pressure_inches', 'max_visibility_miles',
       'mean_visibility_miles', 'min_visibility_miles', 'max_wind_Speed_mph',
       'mean_wind_speed_mph', 'precipitation_inches',
       'cloud_cover', 'wind_dir_degrees']
In [149]:
for col in nan_cols:
    replace_nan(weather, col)

Quoted from the following link: http://www.differencebetween.net/science/nature/difference-between-gust-and-wind/

A gust is a sudden increase of the wind’s speed that lasts no more than 20 seconds.

The max gust speed should be highly correlated with the max wind speed.

In [150]:
# If the gust data is missing, it should mean that there is no gust.
# This is somewhat confirmed by the fact that there is no gust value of 0 in the data.
weather.loc[weather.max_gust_speed_mph == 0, 'max_gust_speed_mph']
Out[150]:
Series([], Name: max_gust_speed_mph, dtype: float64)
In [151]:
weather.max_gust_speed_mph.fillna(0, inplace=True) # Replace all the nan values in gust data with 0.
In [152]:
weather.isna().sum()
Out[152]:
date                                 0
max_temperature_f                    0
mean_temperature_f                   0
min_temperature_f                    0
max_dew_point_f                      0
mean_dew_point_f                     0
min_dew_point_f                      0
max_humidity                         0
mean_humidity                        0
min_humidity                         0
max_sea_level_pressure_inches        0
mean_sea_level_pressure_inches       0
min_sea_level_pressure_inches        0
max_visibility_miles                 0
mean_visibility_miles                0
min_visibility_miles                 0
max_wind_Speed_mph                   0
mean_wind_speed_mph                  0
max_gust_speed_mph                   0
precipitation_inches                 0
cloud_cover                          0
events                            3143
wind_dir_degrees                     0
zip_code                             0
dtype: int64

4.3 Find the correlations between features.

The weather data have too many feature columns, some seeming to be correlated. To fight against the curse of dimensionality, the strongly correlated columns will be filtered out.

In [153]:
# Create a correlation table between all the features.
weather_corr_table = weather.corr(method='pearson', min_periods=1)

display(weather_corr_table)
max_temperature_f mean_temperature_f min_temperature_f max_dew_point_f mean_dew_point_f min_dew_point_f max_humidity mean_humidity min_humidity max_sea_level_pressure_inches mean_sea_level_pressure_inches min_sea_level_pressure_inches max_visibility_miles mean_visibility_miles min_visibility_miles max_wind_Speed_mph mean_wind_speed_mph max_gust_speed_mph precipitation_inches cloud_cover wind_dir_degrees zip_code
max_temperature_f 1.000000 0.911044 0.642260 0.595012 0.505098 0.397487 -0.216879 -0.370503 -0.417841 -0.439687 -0.393728 -0.352066 0.117971 0.255346 0.340895 0.085911 0.066185 -0.040446 -0.133768 -0.307077 0.218432 0.110797
mean_temperature_f 0.911044 1.000000 0.895594 0.761181 0.725137 0.638972 -0.128456 -0.153018 -0.139305 -0.576732 -0.531648 -0.472030 0.146534 0.232811 0.260410 0.176291 0.284251 0.013383 -0.073244 -0.040235 0.168688 0.051660
min_temperature_f 0.642260 0.895594 1.000000 0.798905 0.824766 0.780979 -0.019031 0.118853 0.196733 -0.616909 -0.578559 -0.514491 0.154295 0.165006 0.121248 0.240507 0.447016 0.068079 0.005624 0.250999 0.091723 -0.049166
max_dew_point_f 0.595012 0.761181 0.798905 1.000000 0.943105 0.842477 0.382153 0.402734 0.342451 -0.508217 -0.477169 -0.436924 0.164558 0.097181 -0.033532 0.152299 0.233341 -0.034887 0.051436 0.201694 0.111538 -0.072593
mean_dew_point_f 0.505098 0.725137 0.824766 0.943105 1.000000 0.952300 0.414804 0.502307 0.491508 -0.535617 -0.499858 -0.448049 0.168403 0.076285 -0.049897 0.168825 0.316432 0.002301 0.052520 0.310847 0.085541 -0.053296
min_dew_point_f 0.397487 0.638972 0.780979 0.842477 0.952300 1.000000 0.386314 0.550241 0.585639 -0.503820 -0.467482 -0.412809 0.141436 0.056657 -0.049911 0.149080 0.323991 0.005118 0.046376 0.354173 0.047458 -0.061171
max_humidity -0.216879 -0.128456 -0.019031 0.382153 0.414804 0.386314 1.000000 0.787834 0.535602 -0.005124 -0.025336 -0.037528 -0.024105 -0.243392 -0.424447 -0.046738 -0.045741 -0.072192 0.137592 0.320823 -0.054455 0.021128
mean_humidity -0.370503 -0.153018 0.118853 0.402734 0.502307 0.550241 0.787834 1.000000 0.870298 -0.057553 -0.068959 -0.076246 -0.057358 -0.293365 -0.468310 -0.020438 0.014216 -0.038198 0.168503 0.515528 -0.103110 -0.177231
min_humidity -0.417841 -0.139305 0.196733 0.342451 0.491508 0.585639 0.535602 0.870298 1.000000 -0.095147 -0.096136 -0.087180 0.036518 -0.214135 -0.410531 0.075143 0.203540 0.048904 0.180549 0.601872 -0.135133 -0.171353
max_sea_level_pressure_inches -0.439687 -0.576732 -0.616909 -0.508217 -0.535617 -0.503820 -0.005124 -0.057553 -0.095147 1.000000 0.979604 0.924433 -0.115323 -0.127162 -0.090063 -0.253412 -0.395362 -0.140740 -0.132514 -0.144374 -0.025768 -0.025516
mean_sea_level_pressure_inches -0.393728 -0.531648 -0.578559 -0.477169 -0.499858 -0.467482 -0.025336 -0.068959 -0.096136 0.979604 1.000000 0.970161 -0.093683 -0.097052 -0.049255 -0.250560 -0.392096 -0.155233 -0.176161 -0.162992 0.003900 -0.033449
min_sea_level_pressure_inches -0.352066 -0.472030 -0.514491 -0.436924 -0.448049 -0.412809 -0.037528 -0.076246 -0.087180 0.924433 0.970161 1.000000 -0.064447 -0.063242 -0.010317 -0.235962 -0.346639 -0.170947 -0.197074 -0.161405 0.016946 -0.024084
max_visibility_miles 0.117971 0.146534 0.154295 0.164558 0.168403 0.141436 -0.024105 -0.057358 0.036518 -0.115323 -0.093683 -0.064447 1.000000 0.648950 0.119889 0.059907 0.193409 -0.239640 -0.037091 0.004822 0.053409 -0.014964
mean_visibility_miles 0.255346 0.232811 0.165006 0.097181 0.076285 0.056657 -0.243392 -0.293365 -0.214135 -0.127162 -0.097052 -0.063242 0.648950 1.000000 0.648762 0.081324 0.192977 -0.105146 -0.196960 -0.191155 0.095036 -0.008562
min_visibility_miles 0.340895 0.260410 0.121248 -0.033532 -0.049897 -0.049911 -0.424447 -0.468310 -0.410531 -0.090063 -0.049255 -0.010317 0.119889 0.648762 1.000000 0.063521 0.105577 -0.003152 -0.274907 -0.373926 0.135392 0.031736
max_wind_Speed_mph 0.085911 0.176291 0.240507 0.152299 0.168825 0.149080 -0.046738 -0.020438 0.075143 -0.253412 -0.250560 -0.235962 0.059907 0.081324 0.063521 1.000000 0.560446 0.417987 0.117279 0.119712 -0.020161 0.006413
mean_wind_speed_mph 0.066185 0.284251 0.447016 0.233341 0.316432 0.323991 -0.045741 0.014216 0.203540 -0.395362 -0.392096 -0.346639 0.193409 0.192977 0.105577 0.560446 1.000000 0.336824 0.132111 0.326836 -0.046142 0.042659
max_gust_speed_mph -0.040446 0.013383 0.068079 -0.034887 0.002301 0.005118 -0.072192 -0.038198 0.048904 -0.140740 -0.155233 -0.170947 -0.239640 -0.105146 -0.003152 0.417987 0.336824 1.000000 0.158806 0.117228 -0.029460 0.073607
precipitation_inches -0.133768 -0.073244 0.005624 0.051436 0.052520 0.046376 0.137592 0.168503 0.180549 -0.132514 -0.176161 -0.197074 -0.037091 -0.196960 -0.274907 0.117279 0.132111 0.158806 1.000000 0.248035 -0.152272 0.013492
cloud_cover -0.307077 -0.040235 0.250999 0.201694 0.310847 0.354173 0.320823 0.515528 0.601872 -0.144374 -0.162992 -0.161405 0.004822 -0.191155 -0.373926 0.119712 0.326836 0.117228 0.248035 1.000000 -0.185057 0.059479
wind_dir_degrees 0.218432 0.168688 0.091723 0.111538 0.085541 0.047458 -0.054455 -0.103110 -0.135133 -0.025768 0.003900 0.016946 0.053409 0.095036 0.135392 -0.020161 -0.046142 -0.029460 -0.152272 -0.185057 1.000000 -0.018013
zip_code 0.110797 0.051660 -0.049166 -0.072593 -0.053296 -0.061171 0.021128 -0.177231 -0.171353 -0.025516 -0.033449 -0.024084 -0.014964 -0.008562 0.031736 0.006413 0.042659 0.073607 0.013492 0.059479 -0.018013 1.000000

Quoted from this link http://janda.org/c10/Lectures/topic06/L24-significanceR.htm:

For large samples, it is easy to achieve significance, and one must pay attention to the strength of the correlation to determine if the relationship explains very much.

So the strength of the correlation is more important for the bikeshare data.

In [154]:
r_squared = 0.5 # Set the coefficient of determination to be 0.5 so that more than half of the variance can be explanable.

# Obtain the correlated features.
weather_corr = []
for col in weather_corr_table.columns:
    feature = weather_corr_table[col]
    corr_cols = feature[(feature.pow(2) > r_squared) & (feature != 1)]
    corr_idx = corr_cols.index.values.tolist()
    corr_dict = {col: corr_idx}
    if corr_idx != []:
        weather_corr.append(corr_dict)
In [155]:
display(weather_corr)
[{'max_temperature_f': ['mean_temperature_f']},
 {'mean_temperature_f': ['max_temperature_f',
   'min_temperature_f',
   'max_dew_point_f',
   'mean_dew_point_f']},
 {'min_temperature_f': ['mean_temperature_f',
   'max_dew_point_f',
   'mean_dew_point_f',
   'min_dew_point_f']},
 {'max_dew_point_f': ['mean_temperature_f',
   'min_temperature_f',
   'mean_dew_point_f',
   'min_dew_point_f']},
 {'mean_dew_point_f': ['mean_temperature_f',
   'min_temperature_f',
   'max_dew_point_f',
   'min_dew_point_f']},
 {'min_dew_point_f': ['min_temperature_f',
   'max_dew_point_f',
   'mean_dew_point_f']},
 {'max_humidity': ['mean_humidity']},
 {'mean_humidity': ['max_humidity', 'min_humidity']},
 {'min_humidity': ['mean_humidity']},
 {'max_sea_level_pressure_inches': ['mean_sea_level_pressure_inches',
   'min_sea_level_pressure_inches']},
 {'mean_sea_level_pressure_inches': ['max_sea_level_pressure_inches',
   'min_sea_level_pressure_inches']},
 {'min_sea_level_pressure_inches': ['max_sea_level_pressure_inches',
   'mean_sea_level_pressure_inches']}]

It turns out that:

  • 'mean_temperature_f' can well represent 'max_temperature_f', 'min_temperature_f', 'max_dew_point_f' and 'mean_dew_point_f', but not 'min_dew_point_f';
  • 'mean_humidity' can well represent 'max_humidity' and 'min_humidity';
  • 'mean_sea_level_pressure_inches' can well represent 'max_sea_level_pressure_inches' and 'min_sea_level_pressure_inches'.

All dependent features will be removed to reduce the dimension of the data.

In [156]:
# Explore the categorical column
weather.events.unique()
Out[156]:
array([nan, 'Fog', 'Rain', 'Fog-Rain', 'rain', 'Rain-Thunderstorm'], dtype=object)
In [157]:
weather.loc[weather.events == 'rain', 'events'] = 'Rain'
print(weather.events.unique())
[nan 'Fog' 'Rain' 'Fog-Rain' 'Rain-Thunderstorm']
In [158]:
# Create a weather data with independent columns and ONEHOT the events column.
weather_ind = pd.get_dummies(weather.drop(labels=['max_temperature_f', 'min_temperature_f', 'max_dew_point_f', 'mean_dew_point_f', 'max_humidity', 'min_humidity', 'max_sea_level_pressure_inches', 'min_sea_level_pressure_inches'], axis=1), columns=['events'])

It is strange to find low correlation between the max wind speed and the max gust speed. In fact, this depends on the region:

In [159]:
print('The correlation between the max wind speed and the max gust speed is: \n')
for region in regions:
    regionWeather = weather[weather.zip_code == region]
    corrCoef = pearsonr(regionWeather.max_wind_Speed_mph, regionWeather.max_gust_speed_mph)[0]
    print('{} : {:.2f}'.format(region_names[region], corrCoef))
The correlation between the max wind speed and the max gust speed is: 

San Francisco : 0.75
Redwood City : 0.39
Palo Alto : 0.07
Mountain View : 0.51
San Jose : 0.90
In [160]:
plot_count = 1
plt.figure(figsize=(15, 20))
for region in regions:
    weather_region = weather.loc[weather.zip_code == region, :]
    plt.subplot(5, 1, plot_count)
    plt.scatter(weather_region.max_wind_Speed_mph, 
                weather_region.max_gust_speed_mph, 
                label='{}'.format(region_names[region]))
    plt.xticks(np.arange(0, 45, 5))
    plt.xlim([0, 45])
    plt.xlabel('Max Wind (mph)')
    plt.ylabel('Max Gust (mph)')
    plot_count += 1
    plt.legend()
    plt.grid()

Interestingly, the correlation is actually high for San Francisco and San Jose while very low for the other three cities with the latter sitting in the deep valley close by the mountain and the bay.

4.4 Simplify the column names.

In [161]:
weather_ind.columns
Out[161]:
Index(['date', 'mean_temperature_f', 'min_dew_point_f', 'mean_humidity',
       'mean_sea_level_pressure_inches', 'max_visibility_miles',
       'mean_visibility_miles', 'min_visibility_miles', 'max_wind_Speed_mph',
       'mean_wind_speed_mph', 'max_gust_speed_mph', 'precipitation_inches',
       'cloud_cover', 'wind_dir_degrees', 'zip_code', 'events_Fog',
       'events_Fog-Rain', 'events_Rain', 'events_Rain-Thunderstorm'],
      dtype='object')
In [162]:
weather_ind.columns= ['date', 'Temp', 'Dew', 'Humid', 'Pressure', 'Max_Vis', 'Mean_Vis', 'Min_Vis', 'Max_Wind', 'Mean_Wind', 'Gust', 'Precip', 'Cloud', 'Wind_Deg',  'zip_code', 'Fog', 'Fog_Rain', 'Rain', 'Thunder']
In [163]:
weather_ind.head(1)
Out[163]:
date Temp Dew Humid Pressure Max_Vis Mean_Vis Min_Vis Max_Wind Mean_Wind Gust Precip Cloud Wind_Deg zip_code Fog Fog_Rain Rain Thunder
0 8/29/2013 68.0 56.0 75.0 30.02 10.0 10.0 10.0 23.0 11.0 28.0 0.0 4.0 286.0 94107 0 0 0 0

4.5 Explore monthly variations for the features.

In [164]:
# Previous exploration suggests that the wind degree column has an outlier point. I will correct this point before making the plots.
weather_ind[weather_ind.Wind_Deg > 360]
Out[164]:
date Temp Dew Humid Pressure Max_Vis Mean_Vis Min_Vis Max_Wind Mean_Wind Gust Precip Cloud Wind_Deg zip_code Fog Fog_Rain Rain Thunder
216 9/30/2013 66.0 41.0 73.0 30.05 10.0 10.0 9.0 14.0 4.0 17.0 0.0 4.0 2772.0 94063 0 0 0 0
In [165]:
# This seems to be a typo with an extra '7' in the middle.
weather_ind.loc[weather_ind.Wind_Deg > 360, 'Wind_Deg'] = 272
In [166]:
weather_month = weather_ind.date.apply(lambda x: int(x.split('/')[0]))
In [167]:
# Plot all the numeric feature columns versus months
plot_cols = weather_ind.columns[1:14]

plt.figure(figsize=(25, 15))
plot_idx = 1
for col in plot_cols:
    plt.subplot(3, 5, plot_idx)
    plt.scatter(weather_month, weather_ind[col])
    plt.xlabel('Month')
    plt.title(col)
    plot_idx += 1

The above features don't show significant dependence on each other. This is obvious by looking at the point patterns. Some features have obvious seasonal changes, like the temperature, the dew point, the sea level pressure, the precipitation; some are less variable in seasons, like the visibility, the wind speed, gust speeds and the wind degrees; the others are not dependent on the season, like the cloud cover. No matter what, I feel all these features will affect people's choice in bike riding, so all will be kept for analysis.

4.6 Convert the date to datetime format.

In [168]:
# The conversion is necessary for the combination with other data
weather_ind.date = weather_ind.date.apply(lambda x: datetime.strptime(x, "%m/%d/%Y").date())

Datasets Generation

Drop columns that are not relevant for the datasets:

  1. 'id'. Trip id is unique for a specific trip. It doesn't contain any trip pattern.
  2. 'start_date' and 'end_date'. The previously generated 'date' and 'hour' contain the relevant information already.
  3. 'start_station_id', 'start_station_name', 'end_station_id' and 'end_station_name'. The station groups will be used instead to reduce the complexity of the problem, which reflects the essence of the problem.
  4. 'zip_code'. The useful information has been extracted into the column 'local_zip'.
  5. 'bike_id'. Bike id should not be an important factor as all the sharebikes look and function similarly.

Pretreatment of the trip data.

In [169]:
trip.columns
Out[169]:
Index(['id', 'duration', 'start_date', 'start_station_name',
       'start_station_id', 'end_date', 'end_station_name', 'end_station_id',
       'bike_id', 'subscription_type', 'zip_code', 'station_region',
       'inter_group', 'hour', 'date', 'BDay', 'weekday', 'month', 'local_zip',
       'before_Oct2013'],
      dtype='object')
In [170]:
# Drop columns that not relevant for both datasets.
drop_cols = ['id', 'start_date', 'start_station_name', 'start_station_id', 'end_date', 'end_station_name', 'end_station_id', 'bike_id', 'zip_code']
trip_use = trip.drop(drop_cols, axis=1)
In [171]:
print(trip_use.shape)
trip_use.head(1)
(669959, 11)
Out[171]:
duration subscription_type station_region inter_group hour date BDay weekday month local_zip before_Oct2013
0 63 Subscriber 94107 0 14 2013-08-29 True 3 8 True True

Define valid trips.

In [172]:
# Only consider durations less than 12 hours as valid trips.
duration_12hours = 60*60*12
trip_final = trip_use[trip_use.duration <= duration_12hours]
In [173]:
print('{:.1f}% of trips are within 12 hours.'.format(len(trip_final) / len(trip_use) * 100))
99.8% of trips are within 12 hours.

Inspect whether there are nan values in the two data

In [174]:
trip_final.isna().sum()
Out[174]:
duration             0
subscription_type    0
station_region       0
inter_group          0
hour                 0
date                 0
BDay                 0
weekday              0
month                0
local_zip            0
before_Oct2013       0
dtype: int64

Investigate the weather data.

In [175]:
print(weather_ind.shape)
weather_ind.head(1)
(3665, 19)
Out[175]:
date Temp Dew Humid Pressure Max_Vis Mean_Vis Min_Vis Max_Wind Mean_Wind Gust Precip Cloud Wind_Deg zip_code Fog Fog_Rain Rain Thunder
0 2013-08-29 68.0 56.0 75.0 30.02 10.0 10.0 10.0 23.0 11.0 28.0 0.0 4.0 286.0 94107 0 0 0 0
In [176]:
weather_ind.isna().sum()
Out[176]:
date         0
Temp         0
Dew          0
Humid        0
Pressure     0
Max_Vis      0
Mean_Vis     0
Min_Vis      0
Max_Wind     0
Mean_Wind    0
Gust         0
Precip       0
Cloud        0
Wind_Deg     0
zip_code     0
Fog          0
Fog_Rain     0
Rain         0
Thunder      0
dtype: int64

1. Dataset for Regression

In [177]:
trip_final.columns
Out[177]:
Index(['duration', 'subscription_type', 'station_region', 'inter_group',
       'hour', 'date', 'BDay', 'weekday', 'month', 'local_zip',
       'before_Oct2013'],
      dtype='object')

The goal of tackling the problem for the daily trip counts is to predict the daily trips in the future. Thus, only those features that we know in advance or can be forecast in the future will be used. Anything that is related to the statistics of the trips during the day is not appropriate as the feature, e.g. the average duration of the trips, the ratio of subscribers over non-subscribers, the frequency of inter-group trips, the number of people that comes from a local zip code, etc. On the contrast, a specific date in the future has the following information known beforehand: the year, month, day, whether it is a business day and before October 2013, which weekday it is. However, one needs to be careful when using the year information. Since we are predicting the future, the year may either be irrelevant or can leak information from the test set to the training set. So I decide not to include the year as a feature for the regression.

In [178]:
trip_rgs = trip_final[['date', 'station_region', 'hour', 'month', 'BDay', 'weekday', 'before_Oct2013']]
In [179]:
trip_rgs.head()
Out[179]:
date station_region hour month BDay weekday before_Oct2013
0 2013-08-29 94107 14 8 True 3 True
1 2013-08-29 95113 14 8 True 3 True
2 2013-08-29 94041 10 8 True 3 True
3 2013-08-29 95113 11 8 True 3 True
4 2013-08-29 94107 12 8 True 3 True

Five datasets for regression will be created for different regions / station groups.

In [180]:
# Define a function to be used in the below cell.
def region_weather_trips(data, weather):
    region_data = data.groupby('date')['BDay', 'weekday', 'month', 'before_Oct2013'].median()
    region_data['trip_counts'] = data.groupby('date')['hour'].count()   
    region_data.columns=['BDay', 'weekday', 'month', 'before_Oct2013', 'trip_counts']
    # region_data.rename(columns={'median': 'BDay', 'count': 'trip_counts'}, inplace=True)
    
    # Merge the region data with weather data. The inner join is used as some dates may not be in the region data and the weather data of the missing dates shouldn't be included in the merged data.
    region_weather_merge = pd.merge(region_data, weather, left_index=True, right_on='date') 
    region_weather_merge.set_index('date', inplace=True) # Set the date as the index after merging
    region_weather_merge.drop('zip_code', axis=1, inplace=True)
    
    return region_weather_merge
In [181]:
# Generate five datasets as a dictionary with the zip code as the key.
trip_count_rgs = {}
for group in regions:
    region = trip_rgs[trip_rgs.station_region == group].drop('station_region', axis=1) # Obtain region by zip code
    region_weather = weather_ind[weather_ind.zip_code == group] # Obtain region weather by zip code
    
    region_daily = region_weather_trips(region, region_weather)
    region_am = region_weather_trips(region[region.hour < 13], region_weather)
    region_pm = region_weather_trips(region[region.hour > 12], region_weather)
    
    trip_count_rgs[group] = [region_daily, region_am, region_pm]
    
    # Use OneHot for both the weekday and the month as both columns are actually categorical features rather than numeric features. This may not affect the decision tree algorithms, but will help with the linear regression algorithms.
    # trip_count_rgs[group] = pd.get_dummies(region_daily, columns=['weekday', 'month'])
    # trip_count_rgs[group] = region_daily
In [182]:
len(trip_count_rgs)
Out[182]:
5
In [183]:
# Some regions don't have data for all dates. There are totally 733 days. This also confirms the inner merge.
print(len(trip_count_rgs[94041][0]))
print(len(trip_count_rgs[95113][0]))
print(len(weather)//5) # Five regions.
725
733
733

Add the total dock counts for each day. The number of total counts is an important parameter for trip count prediction.

In [184]:
# Convert the installation date to datetime object
station.installation_date = pd.to_datetime(station.installation_date, format = "%m/%d/%Y").dt.date
In [185]:
# Calculate the total dock counts for each region on each day
# The count_docks function is used to map the date (index) to the total docks.
def count_docks(date):
    return sum(s_group[s_group.installation_date <= date].dock_count)

for region, group in trip_count_rgs.items():
    s_group = station[station.region == region]
    group[0]['total_docks'] = group[0].index.map(count_docks) # group[0] is the daily trip
    group[1]['total_docks'] = group[1].index.map(count_docks) # group[1] is the morning trip
    group[2]['total_docks'] = group[2].index.map(count_docks) # group[2] is the afternoon trip

2. Dataset for Classification

In [186]:
trip_final.head(1)
Out[186]:
duration subscription_type station_region inter_group hour date BDay weekday month local_zip before_Oct2013
0 63 Subscriber 94107 0 14 2013-08-29 True 3 8 True True
In [187]:
weather_ind.head(1)
Out[187]:
date Temp Dew Humid Pressure Max_Vis Mean_Vis Min_Vis Max_Wind Mean_Wind Gust Precip Cloud Wind_Deg zip_code Fog Fog_Rain Rain Thunder
0 2013-08-29 68.0 56.0 75.0 30.02 10.0 10.0 10.0 23.0 11.0 28.0 0.0 4.0 286.0 94107 0 0 0 0
In [188]:
# Merge the trip and weather data based on the date and the region.
trip_weather_merge = pd.merge(trip_final, weather_ind, left_on=['date', 'station_region'], right_on=['date', 'zip_code'])
In [189]:
# Check the sizes of the data before and after merge
print(trip_final.shape)
print(weather_ind.shape)
print(trip_weather_merge.shape)
(668778, 11)
(3665, 19)
(668778, 29)

The merge makes sense: the total number of rows is the same as the trip data. Also, the number of columns equals to the sum of trip and weather data's minus one (same column name for the date, different columns names for the region).

In [190]:
# Remove the date column. All features about a day have been extracted, so the date itself is not relevant.
# Remove the zip_code column as it is a redundant column as the station_region column.
trip_weather_merge.drop(['date', 'zip_code'], axis=1, inplace=True)
In [191]:
# Create the dataset for classfication problem
subscriber_cls = pd.get_dummies(trip_weather_merge, columns=['station_region'])
In [192]:
print('{} rows of the classification data contain the NaN values.'.format(len(subscriber_cls[subscriber_cls.isnull().any(axis=1)])))
0 rows of the classification data contain the NaN values.

Modeling and Fitting

1. Regression Problem for Daily Trip Counts Prediction

In [193]:
trip_rgs.head()
Out[193]:
date station_region hour month BDay weekday before_Oct2013
0 2013-08-29 94107 14 8 True 3 True
1 2013-08-29 95113 14 8 True 3 True
2 2013-08-29 94041 10 8 True 3 True
3 2013-08-29 95113 11 8 True 3 True
4 2013-08-29 94107 12 8 True 3 True
In [194]:
# Show an example of a full dataset in a region with data of daily trip counts.
trip_count_rgs[94107][0].head(1)
Out[194]:
BDay weekday month before_Oct2013 trip_counts Temp Dew Humid Pressure Max_Vis Mean_Vis Min_Vis Max_Wind Mean_Wind Gust Precip Cloud Wind_Deg Fog Fog_Rain Rain Thunder total_docks
date
2013-08-29 True 3 8 True 639 68.0 56.0 75.0 30.02 10.0 10.0 10.0 23.0 11.0 28.0 0.0 4.0 286.0 0 0 0 0 650
In [195]:
# Display the regions in the datasets.
print(trip_count_rgs.keys())
dict_keys([94107, 94063, 94301, 94041, 95113])
In [196]:
region_names[94107]
Out[196]:
'San Francisco'

1.1 Predict the number of daily trips for each station

In [197]:
# A function to predict the label with a certain regressor
def rgs_pred(trip_data, rgs):
    features = trip_data.drop('trip_counts', axis=1)
    counts = trip_data.trip_counts
    
    trainX = features.loc[X_train.index]
    trainY = counts.loc[y_train.index]
    testX = features.loc[X_test.index]
    
    rgs.fit(trainX, trainY)
    
    return rgs.predict(testX)
In [198]:
# This regressor is the one found to have the best performance.
gbr = GradientBoostingRegressor(learning_rate = 0.3,
                                n_estimators = 50,
                                max_depth = 8,
                                min_samples_leaf = 3,
                                random_state = random_state)

for region in regions:
    region_trips = trip_count_rgs[region][0]
    
    region_features = region_trips.drop('trip_counts', axis=1)
    region_counts = region_trips.trip_counts
    
    X_train, X_test, y_train, y_test = train_test_split(region_features, region_counts, test_size=0.2, random_state=random_state)
    
    y_pred = rgs_pred(region_trips, gbr)
    median_daily_trips = np.median(y_train)
    y_benchmark = np.ones(len(y_test)) * median_daily_trips
    average_total_docks = np.mean(X_train.total_docks)
    
    print('For {} (median {:.0f} trips daily with {:.0f} docks):'.format(region_names[region], median_daily_trips, average_total_docks))
    print('The median absolute error for the regular prediction is {:.1f}.'.format(median_absolute_error(y_test, y_pred)))
    print('The median absolute error for the benchmark prediction is {:.1f}. \n'.format(median_absolute_error(y_test, y_benchmark)))
    # print('The root mean square logarithmic error is {:.1f}%.'.format(np.sqrt(mean_squared_log_error(y_test, y_pred))*100))
    # print('The root mean square logarithmic error is {:.1f}%. \n\n'.format(np.sqrt(mean_squared_log_error(y_test, y_benchmark))*100))
For San Francisco (median 917 trips daily with 662 docks):
The median absolute error for the regular prediction is 56.8.
The median absolute error for the benchmark prediction is 260.0. 

For Redwood City (median 4 trips daily with 113 docks):
The median absolute error for the regular prediction is 2.2.
The median absolute error for the benchmark prediction is 2.0. 

For Palo Alto (median 9 trips daily with 75 docks):
The median absolute error for the regular prediction is 3.1.
The median absolute error for the benchmark prediction is 3.0. 

For Mountain View (median 26 trips daily with 112 docks):
The median absolute error for the regular prediction is 3.9.
The median absolute error for the benchmark prediction is 13.5. 

For San Jose (median 55 trips daily with 257 docks):
The median absolute error for the regular prediction is 9.0.
The median absolute error for the benchmark prediction is 13.0. 

I didn't pay attention to the number of daily trips for each region. This result is astonishing! Except San Francisco, there are so few trips in a day in the other regions. The number of trips in San Francisco is much higher than the docks, while in other cities, not all bikes are used daily! The prediction with the regressor performs even worse than the a random guess (median of the label column in the training set) for those stations with very small median daily trips, i.e. in Redwood City and Palo Alto. The regressor performs much better than the benchmark for the other three cities.

1.2 Improve the model for San Francisco trip data

1.2.1 Normalize numeric feature columns

In [199]:
# Collect all numeric features in the trip data
trip_num_features = ['weekday', 'month', 'Temp', 'Dew', 'Humid', 'Pressure', 'Max_Vis', 'Mean_Vis', 'Min_Vis', 'Max_Wind', 'Mean_Wind', 'Gust', 'Precip', 'Cloud', 'Wind_Deg', 'total_docks']
In [200]:
# Investigate the trips in San Francisco. The numeric features are normalized to accommodate all the regressors.
trips_SF = trip_count_rgs[94107][0]
features_SF = trips_SF.drop('trip_counts', axis=1)
features_SF[trip_num_features] = MinMaxScaler().fit_transform(features_SF[trip_num_features])
counts_SF = trips_SF.trip_counts

X_train, X_test, y_train, y_test = train_test_split(features_SF, counts_SF, test_size=0.2, random_state=random_state)

1.2.2 Select proper models by investigating all regressors

In [201]:
mae_scorer = make_scorer(median_absolute_error, greater_is_better=False) # mae: median absolute error
# Define a clear cross validation set
# Shuffle Split is used instead of KFold. A better method for a small dataset.
cv_sets = ShuffleSplit(n_splits=15, test_size = 0.20, random_state=random_state) 
# cv_sets = KFold(n_splits=15, shuffle=True, random_state=random_state)
In [202]:
# The original functions comes from: https://www.kaggle.com/currie32/a-model-to-predict-number-of-daily-trips/notebook
def scoring(rgs):
    scores = cross_val_score(rgs, X_train, y_train, cv=cv_sets, n_jobs=1, scoring = mae_scorer)
    return np.mean(scores)
In [203]:
# Check the performance of all regressors with default setting
regressors = [DummyRegressor,
              AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor, RandomForestRegressor,
              GaussianProcessRegressor,
              HuberRegressor, Lasso, LinearRegression, PassiveAggressiveRegressor, RANSACRegressor, Ridge, SGDRegressor, TheilSenRegressor,
              KernelRidge,
              KNeighborsRegressor,
              MLPRegressor,
              LinearSVR, NuSVR, SVR,
              DecisionTreeRegressor, ExtraTreeRegressor,
              XGBRegressor]

rgs_dict = {} # Create a regressor dictionary to record the regressor with its score
for rgs in regressors:
    begTime = time() # Get the beginning time
    
    # All parameters are set by default.
    # Unfortunately, not all regressors have a random_state parameter, so each run will give a slightly different ranking.
    try:
        rgs_function = rgs(random_state=random_state) 
    except:
        rgs_function = rgs()
        
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        rgs_error = scoring(rgs_function) * (-1) # Error is opposite of score
        
    rgs_dict[rgs.__name__] = rgs_error
    
    useTime = time() - begTime
    print('The regressor {} takes {:.2f} seconds and has a median absolute error of {:.1f}.'.format(rgs.__name__, useTime, rgs_error))    
The regressor DummyRegressor takes 0.07 seconds and has a median absolute error of 363.9.
The regressor AdaBoostRegressor takes 2.07 seconds and has a median absolute error of 95.2.
The regressor BaggingRegressor takes 0.73 seconds and has a median absolute error of 65.6.
The regressor ExtraTreesRegressor takes 0.57 seconds and has a median absolute error of 65.9.
The regressor GradientBoostingRegressor takes 1.04 seconds and has a median absolute error of 69.2.
The regressor RandomForestRegressor takes 0.72 seconds and has a median absolute error of 65.5.
The regressor GaussianProcessRegressor takes 0.90 seconds and has a median absolute error of 120.0.
The regressor HuberRegressor takes 0.91 seconds and has a median absolute error of 83.0.
The regressor Lasso takes 0.10 seconds and has a median absolute error of 92.0.
The regressor LinearRegression takes 0.09 seconds and has a median absolute error of 92.7.
The regressor PassiveAggressiveRegressor takes 0.09 seconds and has a median absolute error of 91.3.
The regressor RANSACRegressor takes 1.96 seconds and has a median absolute error of 79.6.
The regressor Ridge takes 0.09 seconds and has a median absolute error of 92.3.
The regressor SGDRegressor takes 0.08 seconds and has a median absolute error of 150.3.
The regressor TheilSenRegressor takes 32.72 seconds and has a median absolute error of 79.8.
The regressor KernelRidge takes 0.34 seconds and has a median absolute error of 93.1.
The regressor KNeighborsRegressor takes 0.15 seconds and has a median absolute error of 71.3.
The regressor MLPRegressor takes 9.55 seconds and has a median absolute error of 542.7.
The regressor LinearSVR takes 0.10 seconds and has a median absolute error of 258.7.
The regressor NuSVR takes 0.30 seconds and has a median absolute error of 371.1.
The regressor SVR takes 0.40 seconds and has a median absolute error of 295.6.
The regressor DecisionTreeRegressor takes 0.14 seconds and has a median absolute error of 81.0.
The regressor ExtraTreeRegressor takes 0.09 seconds and has a median absolute error of 81.6.
The regressor XGBRegressor takes 0.78 seconds and has a median absolute error of 70.3.

TheilSenRegressor and MLPRegressor take the longest time to process but not with the best scores.

In [204]:
# Plot the scores for all the regressor
pd.Series(rgs_dict).sort_values(ascending=False).plot.barh(figsize=(15, 10), grid=True, fontsize=12)
plt.xticks(np.arange(0, 600, 50))
plt.xlabel('Mean Median Absolute Error from Cross Validation')
plt.show()

For regressors, we can use two benchmark models: the dummy regressor which always predicts the median value of the training set and the plain linear regression mode. Interestingly, some regressors perform worse than the dummy regressor (random guess): NuSVR and MLPRegressor. This doesn't necessarily mean that these two algorithms are not good. It only suggests that the default parameter setting doesn't give the model good performance on this dataset. Similarly, those regressors that have larger error than the linear regression method don't necessarily mean they are bad regressors. It is possible that some regressors have a great potential to improve by adjusting its parameters. However, one cannot explore all the regressors with all possible values of parameters. I will choose investigate further only for three regressors on the top list: Bagging Regressor, Gradient boosting Regressor and Random Forest Regressor. Note that the result may be slightly different for each run, but these three are always on the top list.

In [205]:
rfr_params = {'n_estimators':[20, 30, 40], 'min_samples_leaf':[1, 2, 3]}
br_params = {'n_estimators':[20, 30, 40], 'max_samples':[0.2, 0.5, 1.0], 'max_features':[0.2, 0.5, 1.0]}
gbr_params = {'learning_rate':[0.02, 0.05, 0.08], 'n_estimators':[150, 200, 250], 'min_samples_leaf':[3, 4, 5], 'max_depth':[8, 9, 10]}
In [206]:
regressors = [RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor] # Note that the regressor needs to be a function rather than the name itself.
rgs_params = [rfr_params, br_params, gbr_params]

best_rgs = {}
for rgs, rgs_params in zip(regressors, rgs_params):
    
    grid = GridSearchCV(estimator = rgs(random_state=random_state), param_grid = rgs_params, scoring = mae_scorer, cv = cv_sets, n_jobs=-1, verbose=1)
    grid.fit(X_train, y_train)
    
    print('It takes {:.1f} seconds to grid search the regressor {}. The median absolute error is {:.1f}.'.format(useTime, rgs.__name__, (-1) * grid.best_score_))
    best_rgs[rgs.__name__] = grid.best_estimator_
Fitting 15 folds for each of 9 candidates, totalling 135 fits
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 132 out of 135 | elapsed:    9.6s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    9.9s finished
It takes 0.8 seconds to grid search the regressor RandomForestRegressor. The median absolute error is 62.8.
Fitting 15 folds for each of 27 candidates, totalling 405 fits
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done 402 out of 405 | elapsed:   20.9s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:   21.3s finished
It takes 0.8 seconds to grid search the regressor BaggingRegressor. The median absolute error is 62.4.
Fitting 15 folds for each of 81 candidates, totalling 1215 fits
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   11.8s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:   49.1s
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed:  3.4min
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed:  5.1min finished
It takes 0.8 seconds to grid search the regressor GradientBoostingRegressor. The median absolute error is 61.3.
In [207]:
best_rgs
Out[207]:
{'BaggingRegressor': BaggingRegressor(base_estimator=None, bootstrap=True,
          bootstrap_features=False, max_features=1.0, max_samples=1.0,
          n_estimators=30, n_jobs=1, oob_score=False, random_state=16,
          verbose=0, warm_start=False),
 'GradientBoostingRegressor': GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
              learning_rate=0.05, loss='ls', max_depth=9, max_features=None,
              max_leaf_nodes=None, min_impurity_decrease=0.0,
              min_impurity_split=None, min_samples_leaf=5,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=200, presort='auto', random_state=16,
              subsample=1.0, verbose=0, warm_start=False),
 'RandomForestRegressor': RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
            max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=1,
            oob_score=False, random_state=16, verbose=0, warm_start=False)}
In [208]:
bestRegressor = best_rgs['GradientBoostingRegressor']
In [209]:
# Predict the daily trips using daily data
bestRegressor.fit(X_train, y_train)
SF_daily_y_pred = bestRegressor.predict(X_test)

print('The median absolute error is {:.1f} with the best estimator using the daily trip data.'.format(median_absolute_error(y_test, SF_daily_y_pred)))
The median absolute error is 54.5 with the best estimator using the daily trip data.

1.3 Check if dividing a day into several sections help with the prediction power

Divide by different hours. To predict the number of trips on a particular day, the trip data need to be averaged to certain time sections. From the exploration, I decided to divide the data by the noon point, i.e. one division from 0 am to 12 pm and the other from 1 pm to 11 pm. Aggregating data into certain time slots will make certain feature columns not relevant unless the each element in the feature will be investigated individually (e.g. investigate only the subscriber's behavior or only the non-subscriber's behavior).

In [210]:
def pred_trip_count(trip_data, bestRegressor, train_index, test_index):
    features = trip_data.drop('trip_counts', axis=1)
    features[trip_num_features] = MinMaxScaler().fit_transform(features[trip_num_features])
    counts = trip_data.trip_counts
    
    trip_X_train = features.loc[train_index]
    trip_y_train = counts.loc[train_index]
    trip_X_test =features.loc[test_index]
    
    bestRegressor.fit(trip_X_train, trip_y_train)
    
    # feature_importance = pd.Series(dict(zip(trip_X_train.columns, bestRegressor.feature_importances_)))
    
    return bestRegressor.predict(trip_X_test), bestRegressor.feature_importances_
In [211]:
SF_am = trip_count_rgs[94107][1]
SF_pm = trip_count_rgs[94107][2]

train_index = X_train.index
test_index = X_test.index
In [212]:
# Predict the daily trips using morning and afternoon data
SF_am_y_pred, SF_am_importance = pred_trip_count(SF_am, bestRegressor, train_index, test_index)
SF_pm_y_pred, SF_pm_importance = pred_trip_count(SF_pm, bestRegressor, train_index, test_index)
SF_ampm_y_pred = SF_am_y_pred + SF_pm_y_pred
In [213]:
# Calculate the median value of the y train as the benchmark prediction
y_pred_benchmark = np.ones(len(y_test)) * y_train.median()
In [214]:
# Check the median absolute error metric.
print('The median absolute error is {:.1f} with the best estimator using the daily trip data.'.format(median_absolute_error(y_test, SF_daily_y_pred)))
print('The median absolute error is {:.1f} with the best estimator using the am+pm trip data.'.format(median_absolute_error(y_test, SF_ampm_y_pred)))
print('The median absolute error is {:.1f} with the benchmark model using the daily trip data.'.format(median_absolute_error(y_test, y_pred_benchmark)))
The median absolute error is 54.5 with the best estimator using the daily trip data.
The median absolute error is 54.2 with the best estimator using the am+pm trip data.
The median absolute error is 260.0 with the benchmark model using the daily trip data.
In [215]:
# Check the root mean square logarithmic error metric.
print('The root mean square logarithmic error is {:.4f} with the best estimator using the daily trip data.'.format(np.sqrt(mean_squared_log_error(y_test, SF_daily_y_pred))))
print('The root mean square logarithmic error is {:.4f} with the best estimator using the am+pm trip data.'.format(np.sqrt(mean_squared_log_error(y_test, SF_ampm_y_pred))))
print('The root mean square logarithmic error is {:.4f} with the benchmark using the daily trip data.'.format(np.sqrt(mean_squared_log_error(y_test, y_pred_benchmark))))
The root mean square logarithmic error is 0.1814 with the best estimator using the daily trip data.
The root mean square logarithmic error is 0.1802 with the best estimator using the am+pm trip data.
The root mean square logarithmic error is 0.6481 with the benchmark using the daily trip data.

1.4 Explore the feature importance of the best model for daily, am and pm trips

In [216]:
# daily_importance = pd.Series(dict(zip(X_train.columns, bestRegressor.feature_importances_)))
daily_importance = bestRegressor.feature_importances_
In [217]:
# Plot the feature importances from the daily, am, pm trip models
# Matplotlib code borrowed from: https://chrisalbon.com/python/matplotlib_grouped_bar_plot.html
X_tick_labels = X_train.columns
pos = np.array(range(len(X_tick_labels)))

plt.figure(figsize=(20, 5))
ax = plt.subplot(111)
width = 0.25
ax.bar(pos-width, daily_importance, width=width, color='#EE3224', label='daily')
ax.bar(pos, SF_am_importance, width=width, color='#F78F1E', label='morning')
ax.bar(pos+width, SF_pm_importance,width=width,color='#FFC222', label='afternoon')
ax.legend()

# Set the y axis label
ax.set_ylabel('Feature Importance')

# Set the chart's title
ax.set_title('Feature Importance from Gradient Boosting Regressor')

# Set the position of the x ticks
ax.set_xticks(pos)

# Set the labels for the x ticks
ax.set_xticklabels(X_tick_labels, {'rotation': 'vertical', 'fontsize': 14})

# Adding the legend and showing the plot
plt.legend(loc='upper right')
plt.grid()
plt.show()

The importances have almost the same distribution among features for the three time periods (daily, am, pm).

Amazingly the wind degree and pressure are the two most important features for the Gradient Boosting Regressor. This is strange. I will explore the two features below.

In [218]:
daily_trip_count = daily_trips['BDay']['count']
daily_wind_degree = trips_SF['Wind_Deg']
daily_pressure = trips_SF['Pressure']
In [219]:
# Plot the daily trip count vs. the wind degree
plt.plot(daily_wind_degree, daily_trip_count, '.')
plt.xlabel('Wind Degree')
plt.ylabel('Daily Trip Count')
plt.title('Correlation between Trips and Wind Degree')
plt.show()
In [220]:
# Picture from this link: http://snowfence.umn.edu/Components/winddirectionanddegreeswithouttable3.htm
display(Image(filename='WindDirection.png', width=600))

240-320 degrees corespond to WSW, W, WNW, NW. So most of the time the wind comes from the north west.

In [221]:
plt.plot(daily_pressure, daily_trip_count, '.')
plt.xlabel('Sea Level Pressure')
plt.ylabel('Daily Trip Count')
plt.title('Correlation between Trips and Sea Level Pressure')
plt.show()

Similarly with the wind degree, the sea level pressure also concentrates in a short range.

Feature importance explanation.

The Gradient Boosting Regressor is essentially a decision tree method. So its importance can be explained by that for decision tree. http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

The feature importances. The higher, the more important the feature. The importance of a feature is computed as the (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance R255.

In [222]:
importance_series = pd.Series(dict(zip(X_train.columns, daily_importance))).sort_values(ascending=False).reset_index()
importance_series.columns = ['feature_regular', 'importance_regular']
display(importance_series)
feature_regular importance_regular
0 BDay 0.181869
1 Wind_Deg 0.125949
2 Pressure 0.078783
3 Humid 0.077042
4 Temp 0.065730
5 Gust 0.063379
6 Dew 0.062518
7 Mean_Wind 0.050992
8 weekday 0.044346
9 Max_Wind 0.043712
10 Min_Vis 0.041195
11 month 0.040899
12 Cloud 0.031912
13 Precip 0.031217
14 Mean_Vis 0.020635
15 Fog_Rain 0.014649
16 total_docks 0.013795
17 Rain 0.004623
18 Fog 0.003419
19 before_Oct2013 0.003336
20 Max_Vis 0.000000
21 Thunder 0.000000

The wind degree and the sea level pressure have been explained above. These two features have very concentrated distribution in a short range.

The business day ranks the third and is the first feature that is not weather-related. The second non-weather feature is the weekday, ranking 9. The humidity, temperature and wind speed are undoubtfully important. Surprisingly, the cloud, precipitation, rain, fog features are not so important, which is contradicting to common sense. I can only explain it that these weather information may not apply to and have very limited influence on the whole day. For instance, it may fog or rain in the early morning or during night when the bike trips are rare.

In fact, the above interpretation may not be right. According to this link:

Just because a node is lower on the tree does not necessarily mean that it is less important. The feature importance in sci-kitlearn is calculated by how purely a node separates the classes (Gini index).

1.5 Test the performance treating the data as time series

In [223]:
# before_time_split = trips_SF.index < time_split
# after_time_split = trips_SF.index >= time_split

split_index = int(features_SF.shape[0] * 0.8) + 1
print('The splitting date is {}.'.format(features_SF.index[split_index]))

X_train_time = features_SF.iloc[:split_index]
y_train_time = counts_SF[:split_index]

X_test_time = features_SF[split_index:]
y_test_time = counts_SF[split_index:]
The splitting date is 2015-04-08.
In [224]:
print('The percentage of the training set is {:.1f}% of the entire dataset.'.format(len(X_train_time) / len(trips_SF) * 100))
The percentage of the training set is 80.1% of the entire dataset.
In [225]:
gbr_params
Out[225]:
{'learning_rate': [0.02, 0.05, 0.08],
 'max_depth': [8, 9, 10],
 'min_samples_leaf': [3, 4, 5],
 'n_estimators': [150, 200, 250]}
In [226]:
gbr_params_time = {'learning_rate': [0.05, 0.06, 0.07],
                   'max_depth': [5, 6, 7],
                   'min_samples_leaf': [1, 2, 3],
                   'n_estimators': [50, 100, 150]}
In [227]:
# Grid search
rgs_time = GradientBoostingRegressor(random_state=random_state)
cv_sets_time = TimeSeriesSplit(n_splits=15)
grid_time = GridSearchCV(estimator = rgs_time, param_grid = gbr_params_time, scoring = mae_scorer, cv = cv_sets_time, n_jobs=-1, verbose=1)
grid_time.fit(X_train_time, y_train_time)
    
print('The grid search of the Gradient Boosting Regressor gives the median absolute error {:.1f} for the best estimator.'.format((-1) * grid_time.best_score_))
Fitting 15 folds for each of 81 candidates, totalling 1215 fits
[Parallel(n_jobs=-1)]: Done 172 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 772 tasks      | elapsed:   54.7s
[Parallel(n_jobs=-1)]: Done 1212 out of 1215 | elapsed:  1.4min remaining:    0.2s
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed:  1.5min finished
The grid search of the Gradient Boosting Regressor gives the median absolute error 92.5 for the best estimator.
In [228]:
# Best model for time series training vs. regular training sets.
bestRegressor_time = grid_time.best_estimator_

print('The best regressor for time-series splitted training set is:')
print(bestRegressor_time)
print('\n')
print('The best regressor for regularly splitted training set is:')
print(bestRegressor)
The best regressor for time-series splitted training set is:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.06, loss='ls', max_depth=6, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=2,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=16,
             subsample=1.0, verbose=0, warm_start=False)


The best regressor for regularly splitted training set is:
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='ls', max_depth=9, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=5,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=200, presort='auto', random_state=16,
             subsample=1.0, verbose=0, warm_start=False)

As we can see above, the best regressor for a normal training is different than the time series training.

In [229]:
y_pred_time = bestRegressor_time.predict(X_test_time)
# Calculate the median value of the y train as the benchmark prediction
y_pred_benchmark_time =  np.ones(len(y_test_time)) * y_train_time.median()

print('The median absolute error is {:.1f} with the best estimator for the time-series splitted test set.'.format(median_absolute_error(y_test_time, y_pred_time)))
print('The median absolute error is {:.1f} with the benchmark for the time-series splitted test set.\n'.format(median_absolute_error(y_test_time, y_pred_benchmark_time)))
print('The root mean square logarithmic error is {:.4f} with the best estimator for the time-series splitted test set.'.format(np.sqrt(mean_squared_log_error(y_test_time, y_pred_time))))
print('The root mean square logarithmic error is {:.4f} with the benchmark for the time-series splitted test set.'.format(np.sqrt(mean_squared_log_error(y_test_time, y_pred_benchmark_time))))
The median absolute error is 82.9 with the best estimator for the time-series splitted test set.
The median absolute error is 332.0 with the benchmark for the time-series splitted test set.

The root mean square logarithmic error is 0.1484 with the best estimator for the time-series splitted test set.
The root mean square logarithmic error is 0.6226 with the benchmark for the time-series splitted test set.
In [230]:
print('The median absolute error is {:.1f} with the best estimator using the daily trip data.'.format(median_absolute_error(y_test, SF_daily_y_pred)))
print('The median absolute error is {:.1f} with the benchmark model using the daily trip data.'.format(median_absolute_error(y_test, y_pred_benchmark)))
print('The root mean square logarithmic error is {:.4f} with the best estimator using the daily trip data.'.format(np.sqrt(mean_squared_log_error(y_test, SF_daily_y_pred))))
print('The root mean square logarithmic error is {:.4f} with the benchmark using the daily trip data.'.format(np.sqrt(mean_squared_log_error(y_test, y_pred_benchmark))))
The median absolute error is 54.5 with the best estimator using the daily trip data.
The median absolute error is 260.0 with the benchmark model using the daily trip data.
The root mean square logarithmic error is 0.1814 with the best estimator using the daily trip data.
The root mean square logarithmic error is 0.6481 with the benchmark using the daily trip data.

1.6 Decision tree display

In [231]:
importance_time = pd.Series(dict(zip(X_train_time.columns, bestRegressor_time.feature_importances_))).sort_values(ascending=False).reset_index()
importance_time.columns = ['feature_time', 'importance_time']
display(pd.concat([importance_time, importance_series], axis=1))
feature_time importance_time feature_regular importance_regular
0 BDay 0.294956 BDay 0.181869
1 Wind_Deg 0.105962 Wind_Deg 0.125949
2 month 0.068793 Pressure 0.078783
3 Pressure 0.061170 Humid 0.077042
4 Humid 0.059834 Temp 0.065730
5 Temp 0.057173 Gust 0.063379
6 Dew 0.053723 Dew 0.062518
7 Gust 0.050886 Mean_Wind 0.050992
8 weekday 0.049667 weekday 0.044346
9 Max_Wind 0.046343 Max_Wind 0.043712
10 Cloud 0.030897 Min_Vis 0.041195
11 Mean_Wind 0.028482 month 0.040899
12 total_docks 0.025390 Cloud 0.031912
13 Precip 0.023881 Precip 0.031217
14 Min_Vis 0.021018 Mean_Vis 0.020635
15 Mean_Vis 0.010612 Fog_Rain 0.014649
16 Rain 0.005169 total_docks 0.013795
17 Fog 0.002119 Rain 0.004623
18 before_Oct2013 0.002092 Fog 0.003419
19 Fog_Rain 0.001831 before_Oct2013 0.003336
20 Max_Vis 0.000000 Max_Vis 0.000000
21 Thunder 0.000000 Thunder 0.000000

The feature importances for the time-series splitted dataset are nearly the same as those of the regularly splitted dataset.

In [232]:
# https://stackoverflow.com/questions/44974360/how-to-visualize-an-sklearn-gradientboostingclassifier
# Pick a decision tree number from the 50 estimators of the bestRegressor_time.
sub_tree_16 = bestRegressor_time.estimators_[16, 0]
In [233]:
# Display the decision tree map. Follow the instruction from the following link:
# https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176
# Change the node color following the instruction from this link:
# https://stackoverflow.com/questions/43214350/color-of-the-node-of-tree-with-graphviz-using-class-names/43218264#43218264

dot_data = StringIO()
export_graphviz(sub_tree_16, out_file=dot_data, max_depth=3,
                feature_names=X_train_time.columns,
                filled=True, rounded=True, rotate=True,
                proportion=True, special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('sub_tree_16.png')
Image(graph.create_png(), width=800)
Out[233]:

One decision tree of the 50 estimators in the bestRegressor_time (Gradient Boosting Regressor) is displayed above with three depth layers to show the most important features. The higher the value of information, the darker the color.

Note that the feature importance of the regressor is ranked upon the votes from all 50 decision trees. Not every tree necessarily obeys the order of importance.

1.7 Feature importance ranking

http://scikit-learn.org/stable/modules/feature_selection.html http://blog.datadive.net/interpreting-random-forests/ The Gradient Boosting Regressor is a tree-based algorithm. It outputs the feature importance that can be used for feature selection by taking out the least important features. However, the ranking of the importance doesn't necessarily be the right order.

https://medium.com/@rnbrown/creating-and-visualizing-decision-trees-with-python-f8e8fa394176

In [234]:
# LassoCV implementation
# http://scikit-learn.org/stable/auto_examples/linear_model/plot_lasso_model_selection.html
lassocv_time = LassoCV(cv = cv_sets_time, random_state=random_state).fit(X_train_time, y_train_time)
print('The optimal Lasso model has the alpha {:.2f}.'.format(lassocv_time.alpha_))
The optimal Lasso model has the alpha 0.90.
In [235]:
# bestLasso = Lasso_grid_time.best_estimator_
y_pred_Lasso = lassocv_time.predict(X_test_time)

print('The median absolute error is {:.1f} with the best Lasso for the time-series splitted test set.'.format(median_absolute_error(y_test_time, y_pred_Lasso)))
print('The median absolute error is {:.1f} with the benchmark for the time-series splitted test set.\n'.format(median_absolute_error(y_test_time, y_pred_benchmark_time)))
print('The root mean square logarithmic error is {:.4f} with the best Lasso for the time-series splitted test set.'.format(np.sqrt(mean_squared_log_error(y_test_time, y_pred_Lasso))))
print('The root mean square logarithmic error is {:.4f} with the benchmark for the time-series splitted test set.'.format(np.sqrt(mean_squared_log_error(y_test_time, y_pred_benchmark_time))))
The median absolute error is 125.0 with the best Lasso for the time-series splitted test set.
The median absolute error is 332.0 with the benchmark for the time-series splitted test set.

The root mean square logarithmic error is 0.2196 with the best Lasso for the time-series splitted test set.
The root mean square logarithmic error is 0.6226 with the benchmark for the time-series splitted test set.

The results with Lasso regressor is worse than the Gradient Boosting Regressor, but still much better than the naive predictor. So it still makes sense to use the coefficients of the Lasso regressor for feature ranking and interpretation.

In [236]:
# Obtain the coefficients from the Lasso fitting.
importance_Lasso = pd.Series(dict(zip(X_train_time.columns, lassocv_time.coef_))).reset_index()
importance_Lasso['abs_coef'] = np.abs(importance_Lasso[0])
importance_Lasso.columns = ['features', 'coefficient', 'abs_coef']
importance_Lasso_sorted = importance_Lasso.sort_values('abs_coef', ascending=False).set_index('features')
In [237]:
importance_Lasso_sorted.coefficient
Out[237]:
features
BDay              589.905380
Precip           -553.159846
Temp              322.246582
Mean_Wind        -106.616824
Wind_Deg          103.313499
weekday           -99.839786
Rain              -81.428010
total_docks        77.619093
Fog_Rain          -76.690316
Dew                69.855156
Gust              -48.740686
before_Oct2013    -31.385932
Min_Vis            26.259021
Cloud             -19.759931
month              -9.211703
Humid               0.000000
Max_Vis            -0.000000
Max_Wind           -0.000000
Mean_Vis            0.000000
Pressure            0.000000
Fog                 0.000000
Thunder            -0.000000
Name: coefficient, dtype: float64

Lasso is a linear regression method. If all feature columns are normalized to (0, 1), then the fitted coefficients become comparable. A feature coefficient can be further adjusted by the center of the most frequently appearing values, i.e. the mean. Considering that only when a feature having nonzero value does it contribute to the final result, the mean of a feature column will be calculated after all zero values are removed from the column.

In [238]:
# Calculate the median of nonzero values in the feature column.
feature_mean = []
for col in X_train_time[importance_Lasso_sorted.index]:
    feature_col = X_train_time[col]
    feature_mean.append(feature_col.iloc[feature_col.nonzero()[0]].mean())

importance_Lasso_sorted['mean_adjusted'] = importance_Lasso_sorted['coefficient'] * feature_mean
importance_Lasso_sorted['abs_mean_adjusted'] = importance_Lasso_sorted['abs_coef'] * feature_mean
In [239]:
importance_Lasso_sorted.plot.bar(figsize=(20, 5), ylim=(-600,600), grid=True, fontsize=14)
plt.xlabel('')
plt.ylabel('Coefficient')
plt.show()

1.8 How does non-relevant feature removal impact the prediction?

In [240]:
# Collect all the features that have the coefficient equal to zero. These features are considered not very relevant and will be dropped.
features_to_drop = importance_Lasso_sorted.index[importance_Lasso_sorted.coefficient == 0].tolist()
print(features_to_drop)
['Humid', 'Max_Vis', 'Max_Wind', 'Mean_Vis', 'Pressure', 'Fog', 'Thunder']
In [241]:
X_train_time_drop = X_train_time.drop(features_to_drop, axis=1)
X_test_time_drop = X_test_time.drop(features_to_drop, axis=1)
In [242]:
X_train_time_drop.head(1)
Out[242]:
BDay weekday month before_Oct2013 Temp Dew Min_Vis Mean_Wind Gust Precip Cloud Wind_Deg Fog_Rain Rain total_docks
date
2013-08-29 True 0.5 0.636364 True 0.75 0.927273 1.0 0.5 0.301075 0.0 0.5 0.793872 0 0 0.0
In [243]:
gbr_params_time
Out[243]:
{'learning_rate': [0.05, 0.06, 0.07],
 'max_depth': [5, 6, 7],
 'min_samples_leaf': [1, 2, 3],
 'n_estimators': [50, 100, 150]}
In [244]:
gbr_params_drop = {'learning_rate': [0.06, 0.07, 0.08],
                   'max_depth': [5, 6, 7],
                   'min_samples_leaf': [1, 2, 3],
                   'n_estimators': [150, 200, 250]}
In [245]:
# Do grid search for Gradient Boosting Regressor again to find the optimal hyperparameters.
# With less features, it allows to allocate more computation resources to do the grid search.
rgs_time_drop = GradientBoostingRegressor(random_state=random_state)
grid_time_drop = GridSearchCV(estimator = rgs_time_drop, param_grid = gbr_params_drop, scoring = mae_scorer, cv = cv_sets_time, n_jobs=-1, verbose=1)
grid_time_drop.fit(X_train_time_drop, y_train_time)

print('The grid search of the Gradient Boosting Regressor with irrelevant features dropped gives the median absolute error {:.1f} for the best estimator.'.format((-1) * grid_time_drop.best_score_))
Fitting 15 folds for each of 81 candidates, totalling 1215 fits
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   39.5s
[Parallel(n_jobs=-1)]: Done 888 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1212 out of 1215 | elapsed:  2.1min remaining:    0.3s
[Parallel(n_jobs=-1)]: Done 1215 out of 1215 | elapsed:  2.1min finished
The grid search of the Gradient Boosting Regressor with irrelevant features dropped gives the median absolute error 87.4 for the best estimator.
In [246]:
# Compare with the full training set.
bestRegressor_time_drop = grid_time_drop.best_estimator_
y_pred_time_drop = bestRegressor_time_drop.predict(X_test_time_drop)

print('The median absolute error is {:.1f} with features dropped.'.format(median_absolute_error(y_test_time, y_pred_time_drop)))
print('The median absolute error is {:.1f} for the full time-series splitted test set.'.format(median_absolute_error(y_test_time, y_pred_time)))
print('The median absolute error is {:.1f} with the benchmark for the time-series splitted test set.\n'.format(median_absolute_error(y_test_time, y_pred_benchmark_time)))

print('The root mean square logarithmic error is {:.4f} with features dropped.'.format(np.sqrt(mean_squared_log_error(y_test_time, y_pred_time_drop))))
print('The root mean square logarithmic error is {:.4f} for the full time-series splitted test set.'.format(np.sqrt(mean_squared_log_error(y_test_time, y_pred_time))))
print('The root mean square logarithmic error is {:.4f} with the benchmark for the time-series splitted test set.'.format(np.sqrt(mean_squared_log_error(y_test_time, y_pred_benchmark_time))))
The median absolute error is 80.9 with features dropped.
The median absolute error is 82.9 for the full time-series splitted test set.
The median absolute error is 332.0 with the benchmark for the time-series splitted test set.

The root mean square logarithmic error is 0.1410 with features dropped.
The root mean square logarithmic error is 0.1484 for the full time-series splitted test set.
The root mean square logarithmic error is 0.6226 with the benchmark for the time-series splitted test set.
In [247]:
grid_time_drop.best_params_
Out[247]:
{'learning_rate': 0.07,
 'max_depth': 6,
 'min_samples_leaf': 1,
 'n_estimators': 200}

2. Classfication Problem for Subscriber Type Prediction

In [248]:
# Show the datasets.
subscriber_cls.head(3)
Out[248]:
duration subscription_type inter_group hour BDay weekday month local_zip before_Oct2013 Temp Dew Humid Pressure Max_Vis Mean_Vis Min_Vis Max_Wind Mean_Wind Gust Precip Cloud Wind_Deg Fog Fog_Rain Rain Thunder station_region_94041 station_region_94063 station_region_94107 station_region_94301 station_region_95113
0 63 Subscriber 0 14 True 3 8 True True 68.0 56.0 75.0 30.02 10.0 10.0 10.0 23.0 11.0 28.0 0.0 4.0 286.0 0 0 0 0 0 0 1 0 0
1 83 Subscriber 0 12 True 3 8 True True 68.0 56.0 75.0 30.02 10.0 10.0 10.0 23.0 11.0 28.0 0.0 4.0 286.0 0 0 0 0 0 0 1 0 0
2 103 Subscriber 0 18 True 3 8 True True 68.0 56.0 75.0 30.02 10.0 10.0 10.0 23.0 11.0 28.0 0.0 4.0 286.0 0 0 0 0 0 0 1 0 0

2.1 Feature data scaling

In [249]:
# Select feature columns that have numeric values except those with only 0 and 1. 
trip_num_features_cls = ['duration', 'hour', 'weekday', 'month', 'Temp', 'Dew', 'Humid', 'Pressure', 'Max_Vis', 'Mean_Vis', 'Min_Vis', 'Max_Wind', 'Mean_Wind', 'Gust', 'Precip', 'Cloud', 'Wind_Deg']
len(trip_num_features_cls)
Out[249]:
17
In [250]:
# Show the distribution of data for each feature.
subscriber_cls[trip_num_features_cls].plot(kind='box', subplots=True, layout=(3,6), figsize=(20, 10))
plt.show()

The duration feature has a very skewed data distribution. The logarithm will be applied to it before normalization.

In [251]:
# Apply the logarithm to the duration feature. Then scale all features to the range of (0, 1).
cls_scaler = MinMaxScaler() # Give this a name to use the inverse transform later
cls_scaled = subscriber_cls.copy()
cls_scaled['duration'] = np.log(cls_scaled['duration'])
cls_scaled[trip_num_features_cls] = cls_scaler.fit_transform(cls_scaled[trip_num_features_cls])
In [252]:
# Display the box-and-whisker plot for the scaled data. Now the data look more reasonable.
cls_scaled[trip_num_features_cls].plot(kind='box', figsize=(20, 5))
plt.show()

2.2 Generate training and testing sets

In [253]:
# Generate the feature (from the scaled data) and label columns.
features_cls = cls_scaled.drop('subscription_type', axis = 1)
subType_cls = (cls_scaled.subscription_type == 'Customer').astype('int')
In [254]:
features_cls.head(1) # Show the feature columns
Out[254]:
duration inter_group hour BDay weekday month local_zip before_Oct2013 Temp Dew Humid Pressure Max_Vis Mean_Vis Min_Vis Max_Wind Mean_Wind Gust Precip Cloud Wind_Deg Fog Fog_Rain Rain Thunder station_region_94041 station_region_94063 station_region_94107 station_region_94301 station_region_95113
0 0.007418 0 0.608696 True 0.5 0.636364 True True 0.652174 0.885246 0.708333 0.602041 0.333333 0.375 0.5 0.179688 0.478261 0.245614 0.0 0.5 0.794444 0 0 0 0 0 0 1 0 0
In [255]:
# Split the 'features_cls' and 'subType' data into training and testing sets
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(features_cls, subType_cls, test_size=0.2, random_state=random_state)

# Show the results of the split
print("Training set has {} samples.".format(X_train_cls.shape[0]))
print("Testing set has {} samples.".format(X_test_cls.shape[0]))
Training set has 535022 samples.
Testing set has 133756 samples.
In [256]:
X_train_cls.shape
Out[256]:
(535022, 30)
Set the scoring metric and the cross validation set.
In [257]:
# Set the f0.5 as the scorer with more focus on the precision to predict the non-subscriber. The goal of this study is to figure out a special pattern for non-subscriber.
beta = 0.5
f_score = make_scorer(fbeta_score, beta=beta)

# Define a cross validation set with smaller number of split than for the classifier as even the downsampled data is still one order of magnitude than the regression data.
# This will be used for full dataset training as well.
# StratifiedKFold is used instead of the regular KFold to preserve the percentage of samples for each class. This is important for imbalanced datasets.
cv_sets_cls = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)

2.3 Downsample the training set for quick classifier exploration

In [258]:
# Define a function to downsample the data by removing adjacent similar samples.
# The squared sum of element-wise difference between adjacent samples is calculated. Only those data with the squared sum above the threshold will be kept.
# Iterate the process until the size of the data doesn't change more than min_diff from the previous run.
def downsample(data, threshold, min_diff):
    before_size = len(data)
    after_size = 0
    while(before_size - after_size > min_diff):
        before_size = len(data)
        data = data[data.diff().pow(2).sum(axis=1) > threshold]
        after_size = len(data)
    return data
In [259]:
# Divide the X_train_cls data by the subscription type
train_subs = X_train_cls[y_train_cls == 0]
train_cust = X_train_cls[y_train_cls == 1]
In [260]:
# Find the number of unique values in each column
unique_dict = {}
for col, data in X_train_cls.iteritems():
    unique_dict[col] = len(data.unique())
In [261]:
# Display the uniqueness of all columns
unique_dict
Out[261]:
{'BDay': 2,
 'Cloud': 10,
 'Dew': 69,
 'Fog': 2,
 'Fog_Rain': 2,
 'Gust': 46,
 'Humid': 84,
 'Max_Vis': 20,
 'Max_Wind': 50,
 'Mean_Vis': 26,
 'Mean_Wind': 25,
 'Min_Vis': 20,
 'Precip': 84,
 'Pressure': 81,
 'Rain': 2,
 'Temp': 48,
 'Thunder': 2,
 'Wind_Deg': 327,
 'before_Oct2013': 2,
 'duration': 13458,
 'hour': 24,
 'inter_group': 2,
 'local_zip': 2,
 'month': 12,
 'station_region_94041': 2,
 'station_region_94063': 2,
 'station_region_94107': 2,
 'station_region_94301': 2,
 'station_region_95113': 2,
 'weekday': 7}
In [262]:
# Sort the column by the order of uniqueness. The less unique values a column has, the earlier the column will get sorted.
# These features will contribute to square sum of difference substantially: even difference from one such feature will contribute 1 to the total.
# Sorting the data with these features will help downsample the data to a small size while maintaining the sample variance necessary for finding good classifiers.
col_sort = sorted(unique_dict, key=unique_dict.get)

subs_sort = train_subs.sort_values(col_sort)
cust_sort = train_cust.sort_values(col_sort)

Set the allowed deviation from each feature to be 0.03. This suggests that within a column, two values will be considered to have similar impact on the prediction of the label if the value difference is less than 3% of the total range. This method is not ideal: if only one feature changes while the others remain the same, the deviation of that particular feature can be up to 0.51 before being considered different. Most likely the duration impact will be weakened as it changes in the smallest time scale (minute). The larger the time scale, the less frequent the feature will change value.

Note that only columns with numeric values are counted. The deviation from features with only 0 and 1 values is just too big to be even considered. This means that all different combinations of these features will be kept.

In [263]:
len(trip_num_features_cls)
Out[263]:
17
In [264]:
dev = 0.03
threshold = (len(trip_num_features_cls) * dev) ** 2
print(threshold)
0.2601
In [265]:
%%time
# Don't set the min_diff too small, otherwise the downsample function will take a longer time to converge.
# Downsampling is done for data with different subscription type, respectively.
subs_sort_down = downsample(subs_sort, threshold=threshold, min_diff=10)
cust_sort_down = downsample(cust_sort, threshold=threshold, min_diff=10)
CPU times: user 4.08 s, sys: 600 ms, total: 4.68 s
Wall time: 4.41 s
In [266]:
# Combine the two downsized data to form the full data.
X_train_down = pd.concat([subs_sort_down, cust_sort_down])
y_train_down = y_train_cls.loc[X_train_down.index]

down_sample_size = len(X_train_down)
print('After downsampling, the data size shrinks by {:.1f}% to {}.'.format((1-down_sample_size/len(X_train_cls))*100, down_sample_size))
After downsampling, the data size shrinks by 98.1% to 10354.

2.4 Explore proper classifiers for the down-sampled training set

In [267]:
# The original functions comes from: https://www.kaggle.com/currie32/a-model-to-predict-number-of-daily-trips/notebook
# Use all CPUs for the computation.
# The mean is calculated instead of the median to be more comparable with the results from the GridSearchCV
def down_scoring(cls):
    scores = cross_val_score(cls, X_train_down, y_train_down, cv=cv_sets_cls, n_jobs=-1, scoring=f_score)
    return np.mean(scores)
In [268]:
%%time
# Check the performance of all regressors with default setting
# Too slow or gives error: GaussianProcessClassifier, RadiusNeighborsClassifier. 
classifiers = [DummyClassifier,
               AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier,
               LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier,
               GaussianNB,
               KNeighborsClassifier, NearestCentroid,
               MLPClassifier,
               LinearSVC, NuSVC, SVC,
               DecisionTreeClassifier, ExtraTreeClassifier,
               XGBClassifier]

cls_dict = {} # Create a classifier dictionary to record the classifier with its score
for cls in classifiers:
    begTime = time() # Get the beginning time
    
    # All parameters are set by default
    # Unfortunately, not all classifiers have a random_state classifier, so the ranking may change for different runs.
    try:
        cls_function = cls(random_state=random_state)
    except:
        cls_function = cls()
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        cls_score = down_scoring(cls_function)    
        
    useTime = time() - begTime
    
    cls_dict[cls.__name__] = (cls_score, useTime)
    print('The classifier {} takes {:.2f} seconds and has a F0.5 score of {:.3f}.'.format(cls.__name__, useTime, cls_score))    
The classifier DummyClassifier takes 0.93 seconds and has a F0.5 score of 0.442.
The classifier AdaBoostClassifier takes 4.55 seconds and has a F0.5 score of 0.777.
The classifier BaggingClassifier takes 4.66 seconds and has a F0.5 score of 0.800.
The classifier ExtraTreesClassifier takes 1.66 seconds and has a F0.5 score of 0.710.
The classifier GradientBoostingClassifier takes 9.69 seconds and has a F0.5 score of 0.810.
The classifier RandomForestClassifier takes 1.90 seconds and has a F0.5 score of 0.772.
The classifier LogisticRegression takes 1.65 seconds and has a F0.5 score of 0.772.
The classifier PassiveAggressiveClassifier takes 1.16 seconds and has a F0.5 score of 0.658.
The classifier RidgeClassifier takes 1.09 seconds and has a F0.5 score of 0.769.
The classifier SGDClassifier takes 1.18 seconds and has a F0.5 score of 0.730.
The classifier GaussianNB takes 0.96 seconds and has a F0.5 score of 0.663.
The classifier KNeighborsClassifier takes 4.30 seconds and has a F0.5 score of 0.696.
The classifier NearestCentroid takes 0.88 seconds and has a F0.5 score of 0.648.
The classifier MLPClassifier takes 88.53 seconds and has a F0.5 score of 0.790.
The classifier LinearSVC takes 6.01 seconds and has a F0.5 score of 0.775.
The classifier NuSVC takes 72.20 seconds and has a F0.5 score of 0.791.
The classifier SVC takes 42.76 seconds and has a F0.5 score of 0.772.
The classifier DecisionTreeClassifier takes 1.34 seconds and has a F0.5 score of 0.740.
The classifier ExtraTreeClassifier takes 0.95 seconds and has a F0.5 score of 0.647.
The classifier XGBClassifier takes 5.47 seconds and has a F0.5 score of 0.812.
CPU times: user 5.21 s, sys: 3.6 s, total: 8.8 s
Wall time: 4min 11s
In [269]:
df_cls = pd.DataFrame.from_dict(cls_dict, orient='index')
df_cls.columns = ['f_score', 'time_consumption']
df_cls.time_consumption = df_cls.time_consumption / 60 # Change the unit to minute
df_cls_sort = df_cls.sort_values('f_score')
In [270]:
df_cls_sort.plot.barh(figsize=(15, 10), fontsize=14)
plt.xlabel('F0.5_Score / Time Consumption (min)')
plt.show()

It is trickier to pick the right model for a large dataset. Both the performance and the complexity of a model will be considered: a high score is critical and a shorter training time is also important. A balance need to be found between them.

Three classifiers will be picked out. I will follow these strategies:

  1. First, classifiers performing better than the average will be picked up.
  2. Normalize the F0.5 score on the good classifier group.
  3. Create a formula to balance the performance and the time consumption.
  4. Give more emphasis on the performance. Make sure two of the top three classifiers also rank in the top three following the new criterion.
In [271]:
# First select those classifiers that perform at least better than the mean. Then normalize values of the selected classifiers.
df_cls_2 = df_cls_sort.loc[df_cls_sort.f_score > df_cls_sort.f_score.mean(), :]
df_cls_2['f_score_norm'] = minmax_scale(df_cls_2.f_score)
/home/radiowei/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:3: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
In [272]:
# Give more emphasis on the performane by taking the cubic of each value.
# Bagging Classifier ranks in the top three in both groups.
(df_cls_2.f_score_norm ** 3 / df_cls_2.time_consumption).sort_values(ascending=False)
Out[272]:
XGBClassifier                 10.972480
BaggingClassifier              8.121505
RidgeClassifier                5.959313
GradientBoostingClassifier     5.837991
LogisticRegression             5.109133
RandomForestClassifier         4.183174
AdaBoostClassifier             2.467964
LinearSVC                      1.677040
NuSVC                          0.350280
MLPClassifier                  0.268392
SVC                            0.187403
DecisionTreeClassifier         0.081103
SGDClassifier                  0.000000
dtype: float64

Downsampling the training set undoubtfully helps with excluding those classifiers that are too time-consuming. It also gives an idea of the best-performing classifiers. We will check later if a good classifier will do well on both the full dataset and the downsized dataset.

2.5 Find the optimal hyperparameters on the downsampled training data

2.5.1 For sklearn classifiers

In [273]:
# The XGBClassifier document can be found here: http://xgboost.readthedocs.io/en/latest/python/python_api.html
# Tuning instruction: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
bc_params = {'n_estimators':[10, 30, 50], 'max_samples':[0.4, 0.5, 0.6, 0.7]} # For Bagging Classifier, the number of estimators shouldn't be much larger than the default (10), otherwise the training time can be untolerable.
lr_params = {'penalty': ['l1', 'l2'], 'C': np.linspace(0.1, 1.0, 10)} # For Logistic Classifier
rc_params = {'alpha': np.linspace(0.1, 1.0, 20)} # For Ridge Classifier
In [274]:
# Initiate a dictionary to hold the best classifiers
best_cls = {}
In [275]:
# Randomized search will be utilized to save time as this dataset, though downsized, is still one order of magnitude than the regression set.
classifiers = [BaggingClassifier, LogisticRegression, RidgeClassifier] # Note that the classifier needs to be a function rather than the name itself.
cls_params = [bc_params, lr_params, rc_params]
n_iter_search = 10

for cls, cls_params in zip(classifiers, cls_params):
    begTime = time()
    
    grid_down = RandomizedSearchCV(estimator=cls(random_state=random_state), param_distributions=cls_params, n_iter=n_iter_search, scoring = f_score, cv = cv_sets_cls, n_jobs=-1, verbose=1)
    # The whole downsampled data is considered a "training set" for the full set, so all data will be used to capture the unique patterns.
    grid_down.fit(X_train_down, y_train_down)
    
    useTime = time() - begTime
    
    print('It takes {:.1f} seconds to grid search the classifier {}. The median F0.5 score is {:.3f}.'.format(useTime, cls.__name__, grid_down.best_score_))
    best_cls[cls.__name__] = grid_down.best_estimator_
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   30.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  1.4min finished
It takes 86.1 seconds to grid search the classifier BaggingClassifier. The median F0.5 score is 0.817.
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   20.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   33.8s finished
It takes 35.4 seconds to grid search the classifier LogisticRegression. The median F0.5 score is 0.774.
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:    3.0s
It takes 7.1 seconds to grid search the classifier RidgeClassifier. The median F0.5 score is 0.769.
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    6.6s finished

2.5.2 For XGBClassifier

In on comparison run, the following data were recorded:

  • Using the CPU: 330 seconds
  • Using 'gpu_hist': 90 seconds
  • Using 'gpu_exact': 80 seconds

For small to medium dataset, exact greedy method is faster. For very large-dataset, 'hist' method is supposed to be faster. Using 'auto' actually makes the training slower.

http://xgboost.readthedocs.io/en/latest/python/python_api.html

eval_metric (str, callable, optional) – If a str, should be a built-in evaluation metric to use. See doc/parameter.md. If callable, a custom evaluation metric. The call signature is func(y_predicted, y_true) where y_true will be a DMatrix object such that you may need to call the get_label method. It must return a str, value pair where the str is a name for the evaluation and value is the value of the evaluation function. This objective is always minimized.

In [276]:
# Define a customized evaluation function for the xgboost cross valiation -- the F0.5 score.
# https://ajourneyintodatascience.quora.com/Custom-evaluation-function-and-early-stopping-for-xgboost-with-k-fold-validation-Python
def xgb_f_score(y_predicted, dtrain):
    y_true = dtrain.get_label() # Use get_label() to obtain the y_true. Label means the label column.
    y_pred = np.round(y_predicted) # The y_predicted is the sigmoid output. Rounding it will produce binary results.
    score = fbeta_score(y_true, y_pred, beta=0.5)

    return 'fbeta', score # Must return ('name', value)
In [277]:
# Don't set the n_jobs to -1 as the GPU will be used instead of the CPU.
# The mean score is calculated to be comparable with the results from the GridSearchCV.
def xgb_scoring(cls, X_train, y_train):
    scores = cross_val_score(cls, X_train, y_train, cv=cv_sets_cls, scoring=f_score)
    return np.mean(scores)
In [278]:
# First find the best number of estimators
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
# Use the customized evaluation function xgb_f_score. Remember to set the maximize to True.
# Set the folds to be cv_sets_cls so that the result is reproducible. cv_sets_cls is a stratified K-Fold.
# Note that the scoring is on the whole training dataset.
def modelfit(alg, X_train, y_train, useTrainCV=True, folds=cv_sets_cls, feval=xgb_f_score, stopping=50, verbose_eval=True):
    
    if useTrainCV:
        print('Start the cross validation...')
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(X_train.values, label=y_train.values) # Convert the pandas dataframe to the xgboost data format for the cv
        
        # Unfortunately the cross-validation method in xgboost doesn't allow f_score as the metric. AUC should be a similar measure with the f_score.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], folds=folds, 
                          feval=feval, maximize=True, early_stopping_rounds=stopping, verbose_eval=verbose_eval) 
        
        alg.set_params(n_estimators=cvresult.shape[0])
    
    print('Start the scoring...')
    xgb_f_score = cvresult['test-fbeta-mean'].max()
    
    #Print model report:
    print("\nModel Report")
    print("F0.5 Score of the whole training data: {:.4f}. \n".format(xgb_f_score))
    print(alg)
    
    return xgb_f_score
Step 1: Fix learning rate and number of estimators for tuning tree-based parameters
In [279]:
%%time
# Initiate the XGBClassifier. Use 'gpu_exact' as this is a small dataset. Using 'gpu_hist' doesn't use as many GPU memory as the 'gpu_exact'
# Note that the number of estimators is very large. This is the maximal round that the modelfit can try to find the optimal number of estimators..
init_params = {'tree_method': 'gpu_exact', 'predictor': 'gpu_predictor'}
xgb_cls_0 = XGBClassifier(learning_rate= 0.1,
                          n_estimators= 1000,
                          max_depth= 5,
                          min_child_weight= 1,
                          gamma= 0,
                          subsample= 0.8,
                          colsample_bytree= 0.8,
                          reg_alpha= 0,
                          reg_lambda= 1,                          
                          scale_pos_weight= 1,
                          objective= 'binary:logistic',
                          random_state=random_state, **init_params)

# Find the best estimator number.
xgb_cls_0_score = modelfit(xgb_cls_0, X_train_down, y_train_down, folds=cv_sets_cls, verbose_eval=False)
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8153. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=39,
       n_jobs=1, nthread=None, objective='binary:logistic',
       predictor='gpu_predictor', random_state=16, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8, tree_method='gpu_exact')
CPU times: user 1.95 s, sys: 256 ms, total: 2.2 s
Wall time: 2.32 s
Step 2: Tune max_depth and min_child_weight
In [280]:
%%time
# Note the parameters: https://github.com/dmlc/xgboost/blob/master/doc/parameter.md
# Also check here: https://github.com/dmlc/xgboost/issues/2819
# Grid search the 'max_depth' and 'min_child_weight' hyperparameters for the XGBClassifier using GPU to improve the training speed.

xgb_params1={'max_depth': range(3,10,2), 'min_child_weight': range(1,6,2)}
xgb_grid_1 = GridSearchCV(xgb_cls_0, param_grid=xgb_params1, scoring=f_score, cv=cv_sets_cls, verbose=1)
xgb_grid_1.fit(X_train_down, y_train_down)    
xgb_cls_1 = xgb_grid_1.best_estimator_

xgb_cls_1_score = xgb_grid_1.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(xgb_cls_1_score))
print(xgb_cls_1)
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   24.5s finished
F0.5 Score of the cross validation: 0.8235. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=9, min_child_weight=3, missing=None, n_estimators=39,
       n_jobs=1, nthread=None, objective='binary:logistic',
       predictor='gpu_predictor', random_state=16, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8, tree_method='gpu_exact')
CPU times: user 22 s, sys: 2.65 s, total: 24.7 s
Wall time: 24.7 s
In [281]:
xgb_cls_1_score - xgb_cls_0_score
Out[281]:
0.0081645946624087085
Tune max_depth and min_child_weight in finer step
In [282]:
%%time
xgb_params2 = {'max_depth':[6,7,8], 'min_child_weight':[4,5,6]}
xgb_grid_2 = GridSearchCV(xgb_cls_1, param_grid=xgb_params2, scoring=f_score, cv=cv_sets_cls, verbose=1)
xgb_grid_2.fit(X_train_down, y_train_down)
xgb_cls_2 = xgb_grid_2.best_estimator_

xgb_cls_2_score = xgb_grid_2.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(xgb_cls_2_score))
print(xgb_cls_2)
Fitting 10 folds for each of 9 candidates, totalling 90 fits
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:   20.0s finished
F0.5 Score of the cross validation: 0.8246. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=8, min_child_weight=4, missing=None, n_estimators=39,
       n_jobs=1, nthread=None, objective='binary:logistic',
       predictor='gpu_predictor', random_state=16, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8, tree_method='gpu_exact')
CPU times: user 17.9 s, sys: 2.21 s, total: 20.1 s
Wall time: 20.2 s
In [283]:
xgb_cls_2_score - xgb_cls_1_score
Out[283]:
0.0011579264211251106
Re-do the modelfit to find the best number of estimators
In [284]:
# Copy all the parameters for xgb_cls_2 except setting the n_estimators to a large number.
xgb_cls_2_b = XGBClassifier(learning_rate= 0.1,
                            n_estimators= 1000,
                            max_depth= 7,
                            min_child_weight= 6,
                            gamma= 0,
                            subsample= 0.8,
                            colsample_bytree= 0.8,
                            reg_alpha= 0,
                            reg_lambda= 1,
                            scale_pos_weight=1,
                            objective= 'binary:logistic',                            
                            random_state=random_state, **init_params)

xgb_cls_2_b_score = modelfit(xgb_cls_2_b, X_train_down, y_train_down, folds=cv_sets_cls, verbose_eval=False)
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8219. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=7, min_child_weight=6, missing=None, n_estimators=27,
       n_jobs=1, nthread=None, objective='binary:logistic',
       predictor='gpu_predictor', random_state=16, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=0.8, tree_method='gpu_exact')
In [285]:
xgb_cls_2_b_score - xgb_cls_2_score
Out[285]:
-0.0026808544168671622
Step 3: Tune gamma
In [286]:
%%time
xgb_params3 = {'gamma':[i/10.0 for i in range(0,5)]}
xgb_grid_3 = GridSearchCV(xgb_cls_2_b, param_grid=xgb_params3, scoring=f_score, cv=cv_sets_cls, verbose=1)
xgb_grid_3.fit(X_train_down, y_train_down)
xgb_cls_3 = xgb_grid_3.best_estimator_

xgb_cls_3_score = xgb_grid_3.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(xgb_cls_3_score))
print(xgb_cls_3)
Fitting 10 folds for each of 5 candidates, totalling 50 fits
F0.5 Score of the cross validation: 0.8250. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.2, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=27, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.8, tree_method='gpu_exact')
CPU times: user 8.09 s, sys: 800 ms, total: 8.89 s
Wall time: 8.92 s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    8.8s finished
In [287]:
xgb_cls_3_score - xgb_cls_2_b_score
Out[287]:
0.0031036793923202577
Re-do the modelfit to find the best number of estimators
In [288]:
# Copy all the parameters for xgb_cls_2 except setting the n_estimators to a large number.
xgb_cls_3_b = XGBClassifier(learning_rate= 0.1,
                            n_estimators= 1000,
                            max_depth= 7,
                            min_child_weight= 6,
                            gamma= 0.4,
                            subsample= 0.8,
                            colsample_bytree= 0.8,
                            reg_alpha= 0,
                            reg_lambda= 1,
                            scale_pos_weight=1,
                            objective= 'binary:logistic',                            
                            random_state=random_state, **init_params)

xgb_cls_3_b_score = modelfit(xgb_cls_3_b, X_train_down, y_train_down, folds=cv_sets_cls, verbose_eval=False)
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8190. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=31, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.8, tree_method='gpu_exact')
In [289]:
xgb_cls_3_b_score - xgb_cls_3_score
Out[289]:
-0.0060466793923201756
Step 4: Tune subsample and colsample_bytree
In [290]:
%%time
xgb_params4 = {'subsample':[i/10.0 for i in range(6,10)], 'colsample_bytree':[i/10.0 for i in range(6,10)]}
xgb_grid_4 = GridSearchCV(xgb_cls_3_b, param_grid=xgb_params4, scoring=f_score, cv=cv_sets_cls, verbose=1)
xgb_grid_4.fit(X_train_down, y_train_down)
xgb_cls_4 = xgb_grid_4.best_estimator_

xgb_cls_4_score = xgb_grid_4.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(xgb_cls_4_score))
print(xgb_cls_4)
Fitting 10 folds for each of 16 candidates, totalling 160 fits
F0.5 Score of the cross validation: 0.8229. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=31, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
CPU times: user 28 s, sys: 2.68 s, total: 30.7 s
Wall time: 31.1 s
[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:   30.9s finished
In [291]:
xgb_cls_4_score - xgb_cls_3_b_score
Out[291]:
0.003876322111539543
Step 5: Tune regularization parameters
In [292]:
%%time
xgb_params5 = {'reg_alpha':[0.01, 0.1, 1, 10], 'reg_lambda':[0.01, 0.1, 1, 10]}
xgb_grid_5 = GridSearchCV(xgb_cls_4, param_grid=xgb_params5, scoring=f_score, cv=cv_sets_cls, verbose=1)
xgb_grid_5.fit(X_train_down, y_train_down)
xgb_cls_5 = xgb_grid_5.best_estimator_

xgb_cls_5_score = xgb_grid_5.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(xgb_cls_5_score))
print(xgb_cls_5)
Fitting 10 folds for each of 16 candidates, totalling 160 fits
F0.5 Score of the cross validation: 0.8248. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=31, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=0.1, reg_lambda=0.01, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
CPU times: user 28.1 s, sys: 2.59 s, total: 30.7 s
Wall time: 30.7 s
[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:   30.5s finished
In [293]:
xgb_cls_5_score - xgb_cls_4_score
Out[293]:
0.0019408240629785745
Tuning Regularization Parameters in a Smaller Step
In [294]:
%%time
xgb_params6 = {'reg_alpha':[0.5, 1, 2, 3], 'reg_lambda':[0.5, 1, 2, 3]}
xgb_grid_6 = GridSearchCV(xgb_cls_5, param_grid=xgb_params6, scoring=f_score, cv=cv_sets_cls, verbose=1)
xgb_grid_6.fit(X_train_down, y_train_down)
xgb_cls_6 = xgb_grid_6.best_estimator_

xgb_cls_6_score = xgb_grid_6.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(xgb_cls_6_score))
print(xgb_cls_6)
Fitting 10 folds for each of 16 candidates, totalling 160 fits
F0.5 Score of the cross validation: 0.8252. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=31, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=2, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
CPU times: user 27.6 s, sys: 2.91 s, total: 30.6 s
Wall time: 30.6 s
[Parallel(n_jobs=1)]: Done 160 out of 160 | elapsed:   30.4s finished
In [295]:
xgb_cls_6_score - xgb_cls_5_score
Out[295]:
0.00037186195254124144
Re-do the modelfit to find the best number of estimators
In [296]:
xgb_cls_6_b = XGBClassifier(learning_rate= 0.1,
                            n_estimators= 1000,
                            max_depth= 7,
                            min_child_weight= 6,
                            gamma= 0.4,
                            subsample= 0.6,
                            colsample_bytree= 0.6,
                            reg_alpha= 1,
                            reg_lambda= 1,
                            scale_pos_weight=1,
                            objective= 'binary:logistic',                            
                            random_state=random_state, **init_params)

xgb_cls_6_b_score = modelfit(xgb_cls_6_b, X_train_down, y_train_down, folds=cv_sets_cls, verbose_eval=False)
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8226. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=51, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
In [297]:
xgb_cls_6_b_score - xgb_cls_6_score
Out[297]:
-0.0026086747937260624
Step 6: Tune scale_pos_weight for balancing the positive and negative weight
In [298]:
%%time
xgb_params7 = {'scale_pos_weight': np.linspace(0.1, 1, 10)}
xgb_grid_7 = GridSearchCV(xgb_cls_6_b, param_grid=xgb_params7, scoring=f_score, cv=cv_sets_cls, verbose=1)
xgb_grid_7.fit(X_train_down, y_train_down)
xgb_cls_7 = xgb_grid_7.best_estimator_

xgb_cls_7_score = xgb_grid_7.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(xgb_cls_7_score))
print(xgb_cls_7)
Fitting 10 folds for each of 10 candidates, totalling 100 fits
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:   26.6s finished
F0.5 Score of the cross validation: 0.8365. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=51, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1,
       scale_pos_weight=0.40000000000000002, seed=None, silent=True,
       subsample=0.6, tree_method='gpu_exact')
CPU times: user 24.2 s, sys: 2.71 s, total: 26.9 s
Wall time: 26.9 s
In [299]:
xgb_cls_7_score - xgb_cls_6_b_score
Out[299]:
0.013970393987274621
Tuning scale_pos_weight in a Smaller Step
In [300]:
%%time
xgb_params8 = {'scale_pos_weight': np.arange(0.51, 0.69, 0.01)}
xgb_grid_8 = GridSearchCV(xgb_cls_7, param_grid=xgb_params8, scoring=f_score, cv=cv_sets_cls, verbose=1)
xgb_grid_8.fit(X_train_down, y_train_down)
xgb_cls_8 = xgb_grid_8.best_estimator_

xgb_cls_8_score = xgb_grid_8.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(xgb_cls_8_score))
print(xgb_cls_8)
Fitting 10 folds for each of 18 candidates, totalling 180 fits
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed:   48.2s finished
F0.5 Score of the cross validation: 0.8374. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=51, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1,
       scale_pos_weight=0.53000000000000003, seed=None, silent=True,
       subsample=0.6, tree_method='gpu_exact')
CPU times: user 43.7 s, sys: 4.64 s, total: 48.3 s
Wall time: 48.5 s
In [301]:
xgb_cls_8_score - xgb_cls_7_score
Out[301]:
0.00084468254962000167
Step 7: Optimizing Learning Rate
In [302]:
# Explore different learning rates using the modelfit, in place of the GridSearchCV.
xgb_cls_9 = {}
for lr in [i/10.0 for i in range(1,4)]:
    xgb_cls_temp = XGBClassifier(learning_rate= lr,
                                 n_estimators= 1000,
                                 max_depth= 7,
                                 min_child_weight= 6,
                                 gamma= 0.4,
                                 subsample= 0.6,
                                 colsample_bytree= 0.6,
                                 objective= 'binary:logistic',
                                 reg_alpha= 1,
                                 reg_lambda= 1,
                                 scale_pos_weight= 0.58,
                                 random_state=random_state, **init_params)
    
    xgb_temp_score = modelfit(xgb_cls_temp, X_train_down, y_train_down, folds=cv_sets_cls, verbose_eval=False)    
    xgb_cls_9[lr] = (xgb_cls_temp, xgb_temp_score)
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8315. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=41, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8316. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.2,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=17, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8322. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.3,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=11, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Tuning learning curve in a Smaller Step
In [303]:
# Explore different learning rates using the modelfit, in place of the GridSearchCV.
xgb_cls_10 = {}
for lr in [i/100.0 for i in range(12, 30, 2)]:
    xgb_cls_temp = XGBClassifier(learning_rate= lr,
                                 n_estimators= 1000,
                                 max_depth= 7,
                                 min_child_weight= 6,
                                 gamma= 0.4,
                                 subsample= 0.6,
                                 colsample_bytree= 0.6,
                                 objective= 'binary:logistic',
                                 reg_alpha= 1,
                                 reg_lambda= 1,
                                 scale_pos_weight= 0.58,
                                 random_state=random_state, **init_params)
    
    xgb_temp_score = modelfit(xgb_cls_temp, X_train_down, y_train_down, folds=cv_sets_cls, verbose_eval=False)    
    xgb_cls_10[lr] = (xgb_cls_temp, xgb_temp_score)
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8329. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.12,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=44, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8324. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.14,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=39, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8333. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.16,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=24, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8326. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.18,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=39, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8316. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.2,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=17, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8292. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.22,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=22, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8297. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.24,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=21, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8305. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.26,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=13, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
Start the cross validation...
Start the scoring...

Model Report
F0.5 Score of the whole training data: 0.8306. 

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.28,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=16, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
In [304]:
xgb_cls_10[0.2][1] - xgb_cls_8_score 
Out[304]:
-0.0057600765368946982
Step 8: Select the Best Predictor and Explore the Learning Curve
In [305]:
xgb_cls = xgb_cls_10[0.2][0]
In [306]:
# Define a function to calculate the mean and standard deviation of nonzero values in a numpy array.
def nonzero_mean(np_array):
    array_len = len(np_array)
    mean_data_nonzero = np.zeros(array_len)
    std_data_nonzero = np.zeros(array_len)
    
    for i in range(array_len):
        nonzero_array = np_array[i][np_array[i].nonzero()]
        if nonzero_array.any() == True:
            mean_data_nonzero[i] = np.mean(nonzero_array)
            std_data_nonzero[i] = np.std(nonzero_array)
    
    return mean_data_nonzero, std_data_nonzero
In [307]:
def plot_learning_curve(cls, X_train, y_train, train_sizes, scoring, cv=cv_sets_cls, verbose=1):

    sizes, train_scores, test_scores = learning_curve(cls, X_train, y_train, cv=cv, train_sizes=train_sizes, scoring=scoring, verbose=verbose)
    
    # Find the mean and standard deviation for smoothing.
    train_mean, train_std = nonzero_mean(train_scores)
    test_mean, test_std = nonzero_mean(test_scores)
    
    # Plot the learning curve 
    fig, ax = plt.subplots(figsize=(8, 10))
    plt.plot(sizes, train_mean, 'o-', color = 'r', label = 'Training Score')
    plt.plot(sizes, test_mean, 'o-', color = 'g', label = 'Testing Score')
    plt.fill_between(sizes, train_mean - train_std, train_mean + train_std, alpha = 0.15, color = 'r')
    plt.fill_between(sizes, test_mean - test_std, test_mean + test_std, alpha = 0.15, color = 'g')     
    
    plt.xlabel('Number of Training Samples')
    plt.ylabel(str(scoring))
    plt.title('Learning Performance of {}'.format(cls.__class__.__name__))
    
    minorLocator = AutoMinorLocator()
    ax.yaxis.set_minor_locator(minorLocator)
    
    plt.grid(which='both')
    plt.legend(loc='upper left')
    plt.show()
    
    # return train_scores
In [308]:
%%time
# Generate the training set sizes increasing exponentially
# train_sizes = np.geomspace(0.1, 1.0, num=9)
train_sizes = np.linspace(0.1, 1.0, num=9)

# Suppress the warnings due to an "ill-defined" f score
# https://stackoverflow.com/questions/29086398/sklearn-turning-off-warnings
# https://docs.python.org/2/library/warnings.html#temporarily-suppressing-warnings
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    plot_learning_curve(xgb_cls, X_train_down, y_train_down, train_sizes=train_sizes, 
                        cv=cv_sets_cls, scoring=f_score, verbose=1)
[learning_curve] Training set sizes: [ 931 1980 3028 4076 5124 6173 7221 8269 9318]
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    7.5s finished
CPU times: user 7.21 s, sys: 592 ms, total: 7.8 s
Wall time: 7.78 s
In [309]:
%%time
# Explore the accuracy learning curve.
plot_learning_curve(xgb_cls, X_train_down, y_train_down, train_sizes=train_sizes, 
                    cv=cv_sets_cls, scoring='accuracy', verbose=1)
[learning_curve] Training set sizes: [ 931 1980 3028 4076 5124 6173 7221 8269 9318]
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:    7.3s finished
CPU times: user 6.98 s, sys: 680 ms, total: 7.66 s
Wall time: 7.65 s

It is very interesting to see how many training samples are required in order to obtain good performance on the the F0.5 score. We may have just enough samples for the training. On the other hand, additional training on the full dataset may still have some improvement. The learning curve will be investigated on the full dataset as well.

2.6 Evaluate the classifiers on the entire datasets

In [310]:
xgb_cls
Out[310]:
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.6, gamma=0.4, learning_rate=0.2,
       max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
       n_estimators=17, n_jobs=1, nthread=None,
       objective='binary:logistic', predictor='gpu_predictor',
       random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
       seed=None, silent=True, subsample=0.6, tree_method='gpu_exact')
In [311]:
# Change the  use 'gpu_hist' instead of 'gpu_exact'.
init_params_full = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
xgb_cls = XGBClassifier(learning_rate= 0.2,
                        n_estimators= 16,
                        max_depth= 7,
                        min_child_weight= 6,
                        gamma= 0.4,
                        subsample= 0.6,                         
                        colsample_bytree= 0.6,
                        reg_alpha= 1,
                        reg_lambda= 1,
                        objective= 'binary:logistic',
                        scale_pos_weight= 0.58,
                        random_state=random_state, **init_params_full)

best_cls[XGBClassifier.__name__] = xgb_cls
In [312]:
best_cls
Out[312]:
{'BaggingClassifier': BaggingClassifier(base_estimator=None, bootstrap=True,
          bootstrap_features=False, max_features=1.0, max_samples=0.7,
          n_estimators=50, n_jobs=1, oob_score=False, random_state=16,
          verbose=0, warm_start=False),
 'LogisticRegression': LogisticRegression(C=0.59999999999999998, class_weight=None, dual=False,
           fit_intercept=True, intercept_scaling=1, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l1', random_state=16,
           solver='liblinear', tol=0.0001, verbose=0, warm_start=False),
 'RidgeClassifier': RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
         max_iter=None, normalize=False, random_state=16, solver='auto',
         tol=0.001),
 'XGBClassifier': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=0.6, gamma=0.4, learning_rate=0.2,
        max_delta_step=0, max_depth=7, min_child_weight=6, missing=None,
        n_estimators=16, n_jobs=1, nthread=None,
        objective='binary:logistic', predictor='gpu_predictor',
        random_state=16, reg_alpha=1, reg_lambda=1, scale_pos_weight=0.58,
        seed=None, silent=True, subsample=0.6, tree_method='gpu_hist')}
In [313]:
def train_predict(learner, X_train, y_train, X_test, y_test, best=False): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    begTime = time()
    
    results = {}
    
    # Fit the learner to the training data
    start = time() # Get start time
    learner = learner.fit(X_train, y_train)
    end = time() # Get end time
    
    # Calculate the total training time
    results['train_time'] = end - start
        
    # Get the predictions on the test set(X_test), then get predictions on all training samples(X_train) using .predict()
    pred_train_size = int(1 * len(X_train))
    
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:pred_train_size])
    end = time() # Get end time
    
    # Calculate the total prediction time
    results['pred_time'] = end - start
            
    # Compute accuracy on all training samples
    results['acc_train'] = accuracy_score(y_train[:pred_train_size], predictions_train)
        
    # Compute accuracy on test set using accuracy_score()
    results['acc_test'] = accuracy_score(y_test, predictions_test)
    
    # Compute F0.5-score on all training samples using fbeta_score()
    results['f_train'] = fbeta_score(y_train[:pred_train_size], predictions_train, beta=0.5)
        
    # Compute F0.5-score on the test set which is y_test
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)
       
    useTime = time() - begTime
    # Success
    if best == True:
        print("Training with the classifier {} takes {:.1f} seconds.".format((learner.__class__.__name__ + '_best'), useTime))
    if best == False:
        print("Training with the classifier {} takes {:.1f} seconds.".format(learner.__class__.__name__, useTime))

    # Return the results
    return results
In [314]:
clf_A = XGBClassifier(random_state=random_state, **init_params)
clf_A_best = best_cls['XGBClassifier']

clf_B = BaggingClassifier(random_state=random_state)
clf_B_best = best_cls['BaggingClassifier']

clf_C = LogisticRegression(random_state=random_state)
clf_C_best = best_cls['LogisticRegression']

clf_D = RidgeClassifier(random_state=random_state)
clf_D_best = best_cls['RidgeClassifier']


# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C, clf_D]:    
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    results[clf_name]['default'] = train_predict(clf, X_train_cls, y_train_cls, X_test_cls, y_test_cls)

    
for clf in [clf_A_best, clf_B_best, clf_C_best, clf_D_best]:    
    clf_name = clf.__class__.__name__
    results[clf_name]['best'] = train_predict(clf, X_train_cls, y_train_cls, X_test_cls, y_test_cls, best=True)        
Training with the classifier XGBClassifier takes 14.3 seconds.
Training with the classifier BaggingClassifier takes 83.7 seconds.
Training with the classifier LogisticRegression takes 11.6 seconds.
Training with the classifier RidgeClassifier takes 3.4 seconds.
Training with the classifier XGBClassifier_best takes 5.8 seconds.
Training with the classifier BaggingClassifier_best takes 326.7 seconds.
Training with the classifier LogisticRegression_best takes 58.9 seconds.
Training with the classifier RidgeClassifier_best takes 3.2 seconds.

2.7 Compare the default and optimized (using the downsampled training set) classifiers

In [315]:
# Visualization function borrowed from the 'finding_donors' project.
# Modify the funtion to display the performance of each classifier with its default setting and the optimized hyper-parameters using the downsampled training set.
def evaluate(results, accuracy, f1):
    """
    Visualization code to display results of various learners.
    
    inputs:
      - learners: a list of supervised learners
      - stats: a list of dictionaries of the statistic results from 'train_predict()'
      - accuracy: The score for the naive predictor
      - f1: The score for the naive predictor
    """
  
    # Create figure
    fig, ax = plt.subplots(2, 3, figsize = (12, 8))

    # Constants
    bar_width = 0.47
    # Check the color code from Google. https://www.google.com/search?q=color+%2300A0A0&oq=color+%2300A0A0&aqs=chrome..69i57.1192j0j7&sourceid=chrome&ie=UTF-8
    # colors = ['#A00000','#08A000','#0092A0', '#8A00A0']
    # colors = ['#E0F794','#F9EB90','#92B9FC', '#C092FC']
    colors = ['#8fb21e','#ffdd00','#79a9fc','#8b6ab7']
    
    # Super loop to plot four panels of data
    for k, learner in enumerate(results.keys()):
        for j, metric in enumerate(['train_time', 'acc_train', 'f_train', 'pred_time', 'acc_test', 'f_test']):
            for i, label in enumerate(['default', 'best']):
                # Creative plot code
                ax[j//3, j%3].bar(k+i*1.05*bar_width, results[learner][label][metric], width = bar_width, color = colors[k])
                ax[j//3, j%3].set_xlabel("Classifier('default', 'best')")
                ax[j//3, j%3].set_xlim((-0.5, 4.0))
    
    # Add unique y-labels
    ax[0, 0].set_ylabel("Time (in seconds)")
    ax[0, 1].set_ylabel("Accuracy Score")
    ax[0, 2].set_ylabel("F0.5-score")
    ax[1, 0].set_ylabel("Time (in seconds)")
    ax[1, 1].set_ylabel("Accuracy Score")
    ax[1, 2].set_ylabel("F0.5-score")
    
    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("Accuracy Score on Training Subset")
    ax[0, 2].set_title("F0.5-score on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("Accuracy Score on Testing Set")
    ax[1, 2].set_title("F0.5-score on Testing Set")
    
    # Add horizontal lines for naive predictors
    ax[0, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    ax[1, 1].axhline(y = accuracy, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    ax[0, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    ax[1, 2].axhline(y = f1, xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dashed')
    
    # Add horizontal lines for the best predictors
    acc_test = []
    acc_train = []
    f_test = []
    f_train = []
    for _, result in results.items():
        for _, values in result.items():
            acc_test.append(values['acc_test'])
            acc_train.append(values['acc_train'])
            f_test.append(values['f_test'])
            f_train.append(values['f_train'])    
    
    ax[0, 1].axhline(y = max(acc_train), xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dotted')
    ax[0, 1].axhline(y = max(acc_test), xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'r', linestyle = 'dotted')
    ax[1, 1].axhline(y = max(acc_test), xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dotted')
    ax[0, 2].axhline(y = max(f_train), xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dotted')
    ax[0, 2].axhline(y = max(f_test), xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'r', linestyle = 'dotted')   
    ax[1, 2].axhline(y = max(f_test), xmin = -0.1, xmax = 3.0, linewidth = 1, color = 'k', linestyle = 'dotted')
    
    
    # Set y-limits for score panels
    ax[0, 1].set_ylim((0, 1))
    ax[0, 2].set_ylim((0, 1))
    ax[1, 1].set_ylim((0, 1))
    ax[1, 2].set_ylim((0, 1))

    # Create patches for the legend
    patches = []
    for i, learner in enumerate(results.keys()):
        patches.append(mpatches.Patch(color = colors[i], label = learner))
    plt.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
               loc = 'upper center', borderaxespad = 0., ncol = 4, fontsize = 'x-large')
    
    # Aesthetics
    plt.suptitle("Performance Metrics for Four Supervised Learning Models", fontsize = 16, y = 1.10)
    plt.tight_layout()
    plt.show()
In [316]:
# Calculate the accuracy of a naive predictor that always predicts the subscriber as a benchmark for the accuracy.
# Calculate the f0.5 score of a naive predictor that always predicts the non-subscriber as a benchmark for the f0.5 score.
accuracy = np.sum(subType_cls)/float(subType_cls.count()) # If the classifier predicts all to be non-subscribers.
subscriber_accuracy = 1 - accuracy # If the classifier predicts all to be subscribers.
recall = 1
precision = accuracy

# TODO: Calculate F-score using the formula above for beta = 0.5 and correct values for precision and recall.
# HINT: The formula above can be written as (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
beta = 0.5
fscore = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

# Run metrics visualization for the three supervised learning models chosen
evaluate(results, subscriber_accuracy, fscore)

The above graph shows:

  1. The classifier optimization based on the downsampled training set helps improve both the accuracy and f0.5 score on the testing set for the two complex models (bagging and xgboost). The performance on the training set becomes worse, but this is a good sign as that suggests the optimized model tries not to overfit the training set. The performance of the logistic regression and the ridge regression doesn't change significantly before and after the optimzation.

  2. The red dotted line on the training set plot labels the best score on the testing set. We can see that the Bagging classifier tends to overfit, while the other three seem to perform well, with the logistic regression and the ridge classifier a bit underfitting.

  3. In terms of the overall performance, the xgboost is the best. The ridge regression ranks the third in F0.5 score, but the training and the predicting are pretty quick. So these two will be selected to be optimized further directly on the full dataset.

  4. The Bagging classifier takes too much time for the training and the predicting. Not a good classifier. The xgboost classifier is doing very well in time. With the power of the GPU, its training time is even shorter than the logistic regression; its predicting time is very close to both the logistic regression and the ridge classifier.

2.8 Optimize the best classifiers on the entire dataset

Optimizing the XGBClassifier

If the data is larger and more complex, to obtain the best model:

  1. The max_depth will be deeper / larger number.
  2. The learning rate is smaller.
  3. The number of estimator tends to be larger. Don't set that too high, otherwise the training speed is slower (almost a linear scale).
  4. The scale_pos_weight is an important parameter for imbalanced data, which is typical. Always use the stratified K-fold for the cross validation so that different splits of data will be similarly imbalanced.
  5. The gamma tends to be smaller.
  6. The subsample and colsample_bytree become more important.
  7. The reg_alpha and reg_lambda values can have more possibility, rather than just the plain 0 or 1.
  8. If the data is extremely imbalanced, max_delta_step should be tuned.

Below is the XGBClassifier optimized for the entire dataset. Almost all parameters are different than that from the one trained on the downsampled dataset. The performance gets enhanced a lot.

In [317]:
# For the full dataset, use 'gpu_hist' instead of 'gpu_exact'.
init_params_full = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
xgb_full = XGBClassifier(learning_rate= 0.05,
                         n_estimators=250,
                         max_depth= 9,
                         min_child_weight= 5,
                         gamma= 0,
                         subsample= 0.9,                         
                         colsample_bytree= 0.6,
                         reg_alpha= 0.5,
                         reg_lambda= 2,
                         objective= 'binary:logistic',
                         scale_pos_weight= 0.52,
                         random_state=random_state, **init_params_full)
In [318]:
np.geomspace(0.01, 1.0, num=9) * len(y_train_cls)
Out[318]:
array([   5350.22      ,    9514.18606518,   16918.88118299,
         30086.4980486 ,   53502.2       ,   95141.86065178,
        169188.81182986,  300864.98048599,  535022.        ])
In [319]:
# Generate the training set sizes increasing exponentially. The whole process takes about 10 min.
train_sizes = np.geomspace(0.01, 1.0, num=9)
plot_learning_curve(xgb_full, X_train_cls, y_train_cls, train_sizes=train_sizes, 
                    cv=cv_sets_cls, scoring=f_score, verbose=1)
[learning_curve] Training set sizes: [  4815   8562  15226  27077  48151  85627 152269 270778 481519]
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  9.6min finished

It is rather strange why XGBClasifier is doing this. The testing score seems to keep increasing with the number of training samples. The training score is doing normally when the number of training samples is relatively small (~ 50000), but the variance can become very large when there are more training samples. Is this the nature of the XGBClassifier?

Optimizing the Ridge Classifier
In [320]:
%%time
# Find the best alpha for ridge classifier. The RidgeClassifierCV cannot be used as it will report an error for the f0.5 score.
ridge_full = RidgeClassifier(random_state=random_state)

rc_params_full = {'alpha': np.linspace(0.1, 1.0, 5)}
# Remember to set the n_jobs to -1 to take the advantage of multiple CPUs.
rc_grid_full = GridSearchCV(ridge_full, param_grid=rc_params_full, scoring=f_score, n_jobs=-1, cv=cv_sets_cls, verbose=1)
rc_grid_full.fit(X_train_cls, y_train_cls)    
rc_full = rc_grid_full.best_estimator_

rc_full_score = rc_grid_full.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(rc_full_score))
print(rc_full)
Fitting 10 folds for each of 5 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.6min finished
F0.5 Score of the cross validation: 0.8080. 

RidgeClassifier(alpha=0.55000000000000004, class_weight=None, copy_X=True,
        fit_intercept=True, max_iter=None, normalize=False,
        random_state=16, solver='auto', tol=0.001)
CPU times: user 8.84 s, sys: 756 ms, total: 9.6 s
Wall time: 1min 41s

No change in the alpha value, so the Ridge classifier doesn't get improved. Below I will tune the parameter in a finer step.

In [321]:
%%time
rc_params_full_b = {'alpha': np.linspace(0.8, 1.0, 5)}
# Remember to set the n_jobs to -1 to take the advantage of multiple CPUs.
rc_grid_full_b = GridSearchCV(ridge_full, param_grid=rc_params_full_b, scoring=f_score, n_jobs=-1, cv=cv_sets_cls, verbose=1)
rc_grid_full_b.fit(X_train_cls, y_train_cls)    
rc_full_b = rc_grid_full.best_estimator_

rc_full_b_score = rc_grid_full_b.best_score_
print('F0.5 Score of the cross validation: {:.4f}. \n'.format(rc_full_b_score))
print(rc_full_b)
Fitting 10 folds for each of 5 candidates, totalling 50 fits
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.6min finished
F0.5 Score of the cross validation: 0.8080. 

RidgeClassifier(alpha=0.55000000000000004, class_weight=None, copy_X=True,
        fit_intercept=True, max_iter=None, normalize=False,
        random_state=16, solver='auto', tol=0.001)
CPU times: user 8.8 s, sys: 780 ms, total: 9.58 s
Wall time: 1min 40s

No change in the alpha value, so the Ridge classifier doesn't get improved. rc_full is the same as rc_full_b.

In [322]:
# Generate the training set sizes increasing exponentially.
train_sizes = np.geomspace(0.01, 1.0, num=9)
plot_learning_curve(rc_full, X_train_cls, y_train_cls, train_sizes=train_sizes, cv=cv_sets_cls, scoring=f_score, verbose=1)
[learning_curve] Training set sizes: [  4815   8562  15226  27077  48151  85627 152269 270778 481519]
[Parallel(n_jobs=1)]: Done  90 out of  90 | elapsed:  1.1min finished

The learning curve for the RidgeClassifier seems more normal than that for the XGBClassifier. When the training sample size is relatively small, there tends to be more variation for the testing score, then it gets improved. The training and testing scores converge at large number of training samples.

2.9 Prediction on the testing set

In [323]:
%%time
# Fit the training set with the best classifiers and predict on the X_test_cls data.
xgb_cls.fit(X_train_cls, y_train_cls)
y_pred_xgb_down = xgb_cls.predict(X_test_cls)
fscore_xgb_down = fbeta_score(y_test_cls, y_pred_xgb_down, beta=0.5)
print('The F0.5 score on the testing set with trained model from the downsampled dataset is {:.4f}.'.format(fscore_xgb_down))

xgb_full.fit(X_train_cls, y_train_cls)
y_pred_xgb_full = xgb_full.predict(X_test_cls)
fscore_xgb_full = fbeta_score(y_test_cls, y_pred_xgb_full, beta=0.5)
print('The F0.5 score on the testing set with the optimized XGBClassifier is {:.4f}.'.format(fscore_xgb_full))

rc_full.fit(X_train_cls, y_train_cls)
y_pred_rc_full = rc_full.predict(X_test_cls)
fscore_rc_full = fbeta_score(y_test_cls, y_pred_rc_full, beta=0.5)
print('The F0.5 score on the testing set with the optimized RidgeClassifier is {:.4f}.'.format(fscore_rc_full))
The F0.5 score on the testing set with trained model from the downsampled dataset is 0.8354.
The F0.5 score on the testing set with the optimized XGBClassifier is 0.8519.
The F0.5 score on the testing set with the optimized RidgeClassifier is 0.8073.
CPU times: user 22.8 s, sys: 3.55 s, total: 26.3 s
Wall time: 26.3 s
In [324]:
# Plot the confusion matrix with the log scale. The two classifiers have similar graphs, so I plot only one.
# http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html#sphx-glr-auto-examples-model-selection-plot-confusion-matrix-py
cm = confusion_matrix(y_test_cls, y_pred_xgb_full)

classes = ['Subscriber', 'Non-Subscriber']

cmap = plt.cm.Blues
plt.imshow(np.log10(cm), interpolation='nearest', cmap=cmap)
plt.colorbar()
tick_marks = np.arange(2)
plt.xticks(tick_marks, classes, rotation=45)
plt.yticks(tick_marks, classes)

plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

print(cm)
[[112260   1014]
 [  7357  13125]]
In [325]:
# Show the classification report for both classifiers for comparison
print('A classification report for XGBClassifier trained with the downsampled: ')
print(classification_report(y_test_cls, y_pred_xgb_down))
print('\n')

print('A classification report for XGBClassifier: ')
print(classification_report(y_test_cls, y_pred_xgb_full))
print('\n')

print('A classification report for RidgeClassifier: ')
print(classification_report(y_test_cls, y_pred_rc_full))
A classification report for XGBClassifier trained with the downsampled: 
             precision    recall  f1-score   support

          0       0.93      0.99      0.96    113274
          1       0.93      0.59      0.72     20482

avg / total       0.93      0.93      0.92    133756



A classification report for XGBClassifier: 
             precision    recall  f1-score   support

          0       0.94      0.99      0.96    113274
          1       0.93      0.64      0.76     20482

avg / total       0.94      0.94      0.93    133756



A classification report for RidgeClassifier: 
             precision    recall  f1-score   support

          0       0.93      0.98      0.96    113274
          1       0.88      0.61      0.72     20482

avg / total       0.92      0.93      0.92    133756

2.10 Explore Feature Importances for Each Classifier

Plot XGBClassifier Importances
In [326]:
# Plot the importance using xgboost's function. By Gain.
plt.figure(figsize = (20, 15))
ax = plt.subplot()
xgb.plot_importance(xgb_full, ax=ax, importance_type='gain', xlabel='The number of times a feature appears in a tree', height=0.5, grid=False)
xgb_importance_gain = pd.Series(xgb_full.get_booster().get_score(importance_type='gain'))
plt.show()
In [327]:
# Plot the importance using xgboost's function. By Cover.
plt.figure(figsize = (20, 15))
ax = plt.subplot()
xgb.plot_importance(xgb_full, ax=ax, importance_type='cover', xlabel='The number of times a feature appears in a tree', height=0.5, grid=False)
xgb_importance_cover = pd.Series(xgb_full.get_booster().get_score(importance_type='cover'))
plt.show()
In [328]:
# Plot the importance using xgboost's function. By Weight.
plt.figure(figsize = (20, 15))
ax = plt.subplot()
xgb.plot_importance(xgb_full, ax=ax, importance_type='weight', xlabel='The number of times a feature appears in a tree', height=0.5, grid=False)
xgb_importance_weight = pd.Series(xgb_full.get_booster().get_score(importance_type='weight'))
plt.show()
In [329]:
# Concatenate all the importance series together to form a dataframe
xgb_importance_df = pd.concat([xgb_importance_weight, xgb_importance_gain, xgb_importance_cover], axis=1)
xgb_importance_df.columns = ['Weight', 'Gain', 'Cover']
In [330]:
# Change the gain and cover scales to logarithm to minimize the data skew.
xgb_importance_df['LogGain'] = np.log10(xgb_importance_df.Gain)
xgb_importance_df['LogCover'] = np.log(xgb_importance_df.Cover)
In [331]:
# Normalize the three importances to 
xgb_importance_scaled = pd.DataFrame(MinMaxScaler().fit_transform(xgb_importance_df[['Weight', 'LogGain', 'LogCover']]))
xgb_importance_scaled.index = xgb_importance_df.index
xgb_importance_scaled.columns = ['Weight', 'LogGain', 'LogCover']
In [332]:
xgb_importance_scaled.sort_values('LogGain', ascending=False).plot.bar(figsize=(20, 5), grid=True, fontsize=14, )
plt.ylabel('Normalized Importance')
plt.show()
In [333]:
# Plot a decision tree from the xgboost following the instruction below:
# https://machinelearningmastery.com/visualize-gradient-boosting-decision-trees-xgboost-python/
# A pdf file will be opened to see the details. 
# The xgboost plot function for python cannot specify the maximum depth to plot.

tree_plot = xgb.to_graphviz(xgb_full, num_trees=16, rankdir='LR')
# tree_plot.view()
In [334]:
def dot_to_df(tree_body):
    '''Extract the nodes information from a DOT file. The function returns a pandas series with two columns: 
       left nodes and right nodes'''
    
    edge_list = []
    for line in tree_body:
        try:
            edge_left, edge_right = line.split('->')[0:2]
        except:
            continue
            
        node_left = int(edge_left)
        node_right = int(edge_right.split()[0])
        
        edge_list.append((node_left, node_right))
        
    edge_df = pd.DataFrame(edge_list)
    edge_df.columns = ['left_node', 'right_node']
    
    return edge_df
In [335]:
def nodes_depth(tree_body, depth = 3):
    '''This function collects the node indices up to the designated depths.'''
    # Convert the DOT body code to a pandas DataFrame with the edge (node pair)
    edge_df = dot_to_df(tree_body)
       
    d_nodes = [[0]]
    for d in range(0, depth):
        next_node = edge_df.loc[edge_df.left_node.isin(d_nodes[d]), 'right_node'].tolist()
        d_nodes.append(next_node)
        
    return [node_point for d_node in d_nodes for node_point in d_node]
In [336]:
def change_depth(tree_body, depth = 3):
    '''This function changes the depth of a decision tree to display via graphviz.
       It returns a new DOT code with the designated depth for a decision tree.'''
    
    nodes_deep = nodes_depth(tree_body, depth = depth)
    nodes_deeper = nodes_depth(tree_body, depth = depth + 1)
    nodes_diff = list(set(nodes_deeper)^set(nodes_deep))

    keep_line_idx = []
    for idx, line in enumerate(tree_body):
        item_list = line.split()
        left_node = int(item_list[0])

        if left_node in nodes_deep:
            keep_line_idx.append(idx)
        elif (left_node in nodes_diff) and (item_list[1] != '->'):
            keep_line_idx.append(idx)
    
    new_lines = [tree_body[i] for i in keep_line_idx]
    new_string = '\n'.join(new_lines)
    
    new_dot = '\n'.join(['digraph {', '\tgraph [rankdir=LR]', new_string, '}'])
    
    return new_dot
In [337]:
# Create new DOT source code
new_dot_code = change_depth(tree_plot.body, depth = 2)

# Convert the DOT source code to a graph
graph = pydotplus.graph_from_dot_data(new_dot_code)
graph.write_png('xgb_tree_16.png')
Image(graph.create_png(), width=800)
Out[337]:
Explore Important Splitting Points for Important Features
In [338]:
# Enumerate the numeric features to a dictionary
col_dict = {col: idx for idx, col in enumerate(trip_num_features_cls)}    
In [339]:
# Obtain the split value histogram for the duration feature
xgb_split_duration = xgb_full.get_booster().get_split_value_histogram('duration')
In [340]:
def scale_col(x, col):
    '''The function scales the data back for result interpretation.'''
    scaler_min = cls_scaler.data_min_[col_dict[col]]
    scaler_max = cls_scaler.data_max_[col_dict[col]]
    return x * (scaler_max - scaler_min) + scaler_min
In [341]:
# Scale back the duration data and change the unit to minute
xgb_split_duration['duration_minute'] = xgb_split_duration.SplitValue.apply(lambda x: np.exp(scale_col(x, 'duration'))) / 60
In [342]:
# Use 5 min interval counts.
xgb_split_duration['interval_5min'] = (xgb_split_duration.duration_minute // 5 + 1) * 5
xgb_split_duration.groupby('interval_5min')['Count'].sum().plot.bar(figsize=(15, 5))
plt.show()
In [343]:
# Calculate the mean value when duration is 10 min.
xgb_split_duration.loc[xgb_split_duration.interval_5min == 10, 'SplitValue'].mean()
Out[343]:
0.29824502332015806

This above graph shows that the most important splitting point for the duration is at 10 min. 5, 15, and 20 minutes are also important in different conditions.

In [344]:
num_features = trip_num_features_cls[1:] # Don't include the duration as it has been explored above.
In [345]:
num_div = 5 # number of division for the whole range
interval = 1 / num_div # the interval on the normalized values

xgb_splits_max = {}
xgb_splits_max['duration'] = 10
plt_idx = 1 # The subplot index starts from 1
r_color = lambda: random.randint(0,255) # function to generate random color: https://stackoverflow.com/questions/13998901/generating-a-random-hex-color-in-python
plt.figure(figsize=(16, 16))
for col in num_features:
    # Obtain the split values from the xgboost
    split_hist = xgb_full.get_booster().get_split_value_histogram(col)
    split_hist['interval'] = (split_hist['SplitValue'] // interval + 1) * interval # Interval starting not starting from zero.
    split_hist['scaled'] = split_hist['interval'].apply(lambda x: scale_col(x, col))
    
    #xgb_splits[col] = split_hist.groupby('scaled')['Count'].sum()
    split_group = split_hist.groupby('scaled')['Count'].sum()
    xgb_splits_max[col] = split_group.idxmax()
    
    bar_width = 0.1 * (max(split_hist.scaled) - min(split_hist.scaled))
    bar_color = '#%02X%02X%02X' % (r_color(),r_color(),r_color())
    
    plt.subplot(4, 4, plt_idx)
    plt.bar(split_group.index, split_group, width=bar_width, color=bar_color)
    plt.xlabel('%s' % col)
    plt_idx += 1
In [346]:
# Only explore the first ten important features.
xgb_importance = pd.Series(xgb_full.feature_importances_, index=X_train_cls.columns)
xgb_splits_max = pd.Series(xgb_splits_max)
xgb_unique = pd.Series(unique_dict)

xgb_important_splits = pd.concat([xgb_importance, xgb_splits_max, xgb_unique], axis=1)
xgb_important_splits.columns = ['Importance', 'MaxSplitPoint', 'Uniqueness']
xgb_important_splits.sort_values('Importance', ascending=False)[0:10]
Out[346]:
Importance MaxSplitPoint Uniqueness
duration 0.125788 10.000 13458
hour 0.109666 18.400 24
Wind_Deg 0.093006 360.000 327
Pressure 0.070436 30.018 81
Humid 0.067045 81.600 84
Temp 0.064098 65.600 48
Dew 0.061763 50.800 69
Gust 0.054462 45.600 46
month 0.052183 9.800 12
Max_Wind 0.048551 25.600 50

Shown above is the most important splitting point for each feature for the 10 most important features for information gaining on decision trees. These ten features are all numerical, not categorical. This is because the importance is calculated by the times this features appears in the splitting. Numerical values appear more often as they have more different values to split than the categorical. In fact, the following plot with the feature value uniqueness vs. the feature importance shows a clear correlation. This suggests that the importance itself cannot be used to interpret the result. However, the most frequent splitting points, together with the graph of the decision tree, may give shed some light on the interpretation.

In [347]:
# Plot the feature value uniqueness vs. the feature importance and find a strong correlation.
plt.figure(figsize=(5, 5))
plt.scatter(xgb_important_splits.Importance, np.log(xgb_important_splits.Uniqueness))
plt.xlabel('Feature Importance')
plt.ylabel('Logarithm of the Number of Unique Values')
plt.show()
Result Interpretation from Ridge Classifier
In [348]:
# Obtain the coefficients from the Lasso fitting.
importance_Ridge = pd.Series(dict(zip(X_train_cls.columns, rc_full.coef_[0]))).reset_index()
importance_Ridge['abs_coef'] = np.abs(importance_Ridge[0])
importance_Ridge.columns = ['features', 'coefficient', 'abs_coef']
importance_Ridge_sorted = importance_Ridge.sort_values('abs_coef', ascending=False).set_index('features')
In [349]:
# Calculate the mean of nonzero values in the feature column.
# The feature importrance will be adjusted by multipling the mean value. 
# If a feature has a value distribute at the low value end (close to 0 in (0, 1) scale), the feature tends to be overemphasized on its coefficient. It will be adjusted by multipling the feature's mean value.
feature_mean_cls = []
for col in X_train_cls[importance_Ridge_sorted.index]:
    feature_col_cls = X_train_cls[col]
    feature_mean_cls.append(feature_col_cls.iloc[feature_col_cls.nonzero()[0]].mean())

importance_Ridge_sorted['mean_adjusted'] = importance_Ridge_sorted['coefficient'] * feature_mean_cls
importance_Ridge_sorted['abs_mean_adjusted'] = importance_Ridge_sorted['abs_coef'] * feature_mean_cls
In [350]:
# Plot the coefficient column only to demonstrate the signs.
importance_Ridge_sorted[['coefficient', 'mean_adjusted']].plot.bar(figsize=(20, 5), ylim=(-0.8,0.8), grid=True, fontsize=14)
plt.xlabel('')
plt.show()
In [351]:
# Plot columns with absolute values to demonstrate the relative importance. As the coefficient differ significantly from feature to feature, logarithm is used for better observation.
importance_Ridge_sorted[['abs_coef', 'abs_mean_adjusted']].plot.bar(figsize=(20, 5), ylim=(10e-5,3), grid=True, fontsize=14, logy=True)
plt.xlabel('')
plt.show()

2.10 Remove the Unimportant Features

Remove Unimportant Features Selected by Ridge Classifier
In [352]:
# Absolute adjusted coefficients below 10% of the maximum coefficient will be considered not important.
not_important = max(importance_Ridge_sorted.abs_mean_adjusted) / 10
not_important_features = importance_Ridge_sorted[importance_Ridge_sorted.abs_mean_adjusted < not_important].index
In [353]:
X_train_less = X_train_cls.drop(not_important_features, axis=1)
X_test_less = X_test_cls.drop(not_important_features, axis=1)
In [354]:
X_train_less.columns
Out[354]:
Index(['duration', 'hour', 'BDay', 'local_zip', 'before_Oct2013',
       'station_region_94041', 'station_region_94063', 'station_region_94301',
       'station_region_95113'],
      dtype='object')
In [355]:
%%time
# Fit the training set with the best classifiers and predict on the X_test_cls data.
xgb_full.fit(X_train_less, y_train_cls)
y_pred_xgb_less = xgb_full.predict(X_test_less)
fscore_xgb_less = fbeta_score(y_test_cls, y_pred_xgb_less, beta=0.5)
print('The F0.5 score on the testing set with the optimized XGBClassifier is {:.4f}.'.format(fscore_xgb_less))

rc_full.fit(X_train_less, y_train_cls)
y_pred_rc_less = rc_full.predict(X_test_less)
fscore_rc_less = fbeta_score(y_test_cls, y_pred_rc_less, beta=0.5)
print('The F0.5 score on the testing set with the optimized RidgeClassifier is {:.4f}.'.format(fscore_rc_less))
The F0.5 score on the testing set with the optimized XGBClassifier is 0.8434.
The F0.5 score on the testing set with the optimized RidgeClassifier is 0.8070.
CPU times: user 10.9 s, sys: 1.11 s, total: 12 s
Wall time: 12 s
Remove Unimportant Features (by weight) Selected from XGBClassifier
In [356]:
# Absolute adjusted coefficients below 10% of the maximum coefficient will be considered not important.
not_important_2 = max(xgb_importance) / 10
not_important_features_2 = xgb_importance[xgb_importance < not_important_2].index
In [357]:
X_train_less_2 = X_train_cls.drop(not_important_features_2, axis=1)
X_test_less_2 = X_test_cls.drop(not_important_features_2, axis=1)
In [358]:
X_train_less_2.shape
Out[358]:
(535022, 16)
In [359]:
%%time
# Fit the training set with the best classifiers and predict on the X_test_cls data.
xgb_full.fit(X_train_less_2, y_train_cls)
y_pred_xgb_less_2 = xgb_full.predict(X_test_less_2)
fscore_xgb_less_2 = fbeta_score(y_test_cls, y_pred_xgb_less_2, beta=0.5)
print('The F0.5 score on the testing set with the optimized XGBClassifier is {:.4f}.'.format(fscore_xgb_less_2))

rc_full.fit(X_train_less_2, y_train_cls)
y_pred_rc_less_2 = rc_full.predict(X_test_less_2)
fscore_rc_less_2 = fbeta_score(y_test_cls, y_pred_rc_less_2, beta=0.5)
print('The F0.5 score on the testing set with the optimized RidgeClassifier is {:.4f}.'.format(fscore_rc_less_2))
The F0.5 score on the testing set with the optimized XGBClassifier is 0.7226.
The F0.5 score on the testing set with the optimized RidgeClassifier is 0.6393.
CPU times: user 13.4 s, sys: 1.35 s, total: 14.8 s
Wall time: 15.2 s
Compare all importance rankings.
In [360]:
Ridge_Importance = pd.Series(minmax_scale(importance_Ridge_sorted.abs_mean_adjusted))
Ridge_Importance.index = importance_Ridge_sorted.index
In [361]:
cls_importance = pd.concat([xgb_importance_scaled, Ridge_Importance], axis = 1)
cls_importance.rename(columns = {0: 'Ridge'}, inplace=True)
In [362]:
cls_importance.sort_values('Ridge', ascending=False).plot.bar(grid=True, figsize=(20, 5), fontsize=14)
plt.show()

2.11 Importance Analysis on Dataset with Selected Features.

In [363]:
# The unscaled dataset is used instead as tree-based method is not sensitive to value normalization.
features_sel = subscriber_cls.drop('subscription_type', axis = 1).drop(not_important_features, axis=1)
subType_sel = (subscriber_cls.subscription_type == 'Customer').astype('int')
In [364]:
features_sel.shape
Out[364]:
(668778, 9)
In [365]:
# Split the 'features_sel' and 'subType_sel' data into training and testing sets
X_train_sel, X_test_sel, y_train_sel, y_test_sel = train_test_split(features_sel, subType_sel, test_size=0.2, random_state=random_state)

# Show the results of the split
print("Training set has {} samples.".format(X_train_sel.shape[0]))
print("Testing set has {} samples.".format(X_test_sel.shape[0]))
Training set has 535022 samples.
Testing set has 133756 samples.
In [366]:
X_train_sel.shape
Out[366]:
(535022, 9)
In [367]:
init_params_sel = {'tree_method': 'gpu_hist', 'predictor': 'gpu_predictor'}
xgb_sel = XGBClassifier(learning_rate= 0.1,
                         n_estimators=88,
                         max_depth= 7,
                         min_child_weight= 1,
                         gamma= 0,
                         subsample= 0.9,                         
                         colsample_bytree= 0.7,
                         reg_alpha= 0,
                         reg_lambda= 0,
                         objective= 'binary:logistic',
                         scale_pos_weight= 0.48,
                         random_state=random_state, **init_params_sel)
In [368]:
# Check the performance of the optimized XGBClassifier on the testing set.
xgb_sel.fit(X_train_sel, y_train_sel)
y_pred_xgb_sel = xgb_sel.predict(X_test_sel)
fscore_xgb_sel = fbeta_score(y_test_sel, y_pred_xgb_sel, beta=0.5)
print('The F0.5 score on the testing set with the optimized XGBClassifier is {:.4f}.'.format(fscore_xgb_sel))
The F0.5 score on the testing set with the optimized XGBClassifier is 0.8441.
In [369]:
xgb_sel_imWeight = pd.Series(xgb_sel.get_booster().get_score(importance_type='weight'))
xgb_sel_imGain = pd.Series(xgb_sel.get_booster().get_score(importance_type='gain'))
xgb_sel_imCover = pd.Series(xgb_sel.get_booster().get_score(importance_type='cover'))
In [370]:
# Concatenate all the importance series together to form a dataframe
xgb_imSel_df = pd.concat([xgb_sel_imWeight, xgb_sel_imGain, xgb_sel_imCover], axis=1)
xgb_imSel_df.columns = ['Weight', 'Gain', 'Cover']
In [371]:
# Change the gain and cover scales to logarithm to minimize the data skew.
xgb_imSel_df['LogGain'] = np.log10(xgb_imSel_df.Gain)
xgb_imSel_df['LogCover'] = np.log10(xgb_imSel_df.Cover)
In [372]:
# Normalize the three importances to 
xgb_imSel_scaled = pd.DataFrame(MinMaxScaler().fit_transform(xgb_imSel_df[['Weight', 'LogGain', 'LogCover']]))
xgb_imSel_scaled.index = xgb_imSel_df.index
xgb_imSel_scaled.columns = ['Weight', 'LogGain', 'LogCover']
In [373]:
Ridge_im = Ridge_Importance[Ridge_Importance >= not_important]
xgb_imSel_comp = pd.concat([xgb_imSel_scaled, Ridge_im], axis=1)
xgb_imSel_comp.rename(columns={0: 'Ridge'}, inplace=True)
In [374]:
xgb_imSel_comp.sort_values('Ridge', ascending=False).plot.bar(figsize=(20, 5), grid=True, fontsize=14, )
plt.ylabel('Normalized Importance')
plt.show()
In [375]:
# Obtain the split value histograms for duration and hour
xgb_sel_duration = xgb_sel.get_booster().get_split_value_histogram('duration') / 60
xgb_sel_hour = xgb_sel.get_booster().get_split_value_histogram('hour')
In [376]:
time_interval = 5
xgb_sel_duration['interval_5min'] = (xgb_sel_duration['SplitValue'] // time_interval + 1) * time_interval
xgb_sel_duration.groupby('interval_5min')['Count'].sum().plot.bar(figsize=(15, 5))
plt.show()
In [377]:
hour_interval = 3
xgb_sel_hour['interval_3h'] = (xgb_sel_hour['SplitValue'] // hour_interval + 1) * hour_interval
xgb_sel_hour.groupby('interval_3h')['Count'].sum().plot.bar(figsize=(15, 5))
plt.show()
In [378]:
xgb_sel_duration.head()
Out[378]:
SplitValue Count interval_5min
0 2.939855 4.400000 5.0
1 4.163043 2.933333 5.0
2 5.386232 4.400000 10.0
3 6.609420 5.016667 10.0
4 7.832609 3.750000 10.0

2.12 Explore PCA on the X_train_less

In [379]:
# Explore the most important components when PCA is applied to X_train_less. Four components will explain more than 80% of the variance, so four should be good enough.
pca = PCA(n_components=4, random_state=random_state).fit(X_train_less)
print('Totally explained variance is {:.3f}.'.format(sum(pca.explained_variance_ratio_)))
Totally explained variance is 0.804.
In [380]:
# Plot the PCA components.
plt_idx = 1
plt.figure(figsize=(20, 5))
for idx, comp in enumerate(pca.components_):    
    bar_color = '#%02X%02X%02X' % (r_color(),r_color(),r_color())
    
    plt.subplot(1, 4, plt_idx)
    pd.Series(comp, index=X_train_less.columns).plot.bar(sharex=True, grid=True)
    plt.title('Explain Variance Ration: {:.3f}'.format(pca.explained_variance_ratio_[idx]))
    plt_idx += 1

plt.show()

2.13 Can Deep Learning Improve the Prediction?

MLP for binary classification

In [381]:
def get_available_devices():  
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos]

with sys_pipes():
    print(get_available_devices())
2018-01-10 18:08:13.540917: I tensorflow/core/platform/cpu_feature_guard.cc:137] Your CPU supports instructions that this TensorFlow binary was not compiled to use: SSE4.1 SSE4.2 AVX FMA
2018-01-10 18:08:13.551778: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:892] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2018-01-10 18:08:13.552736: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1030] Found device 0 with properties: 
name: GeForce GTX 1050 Ti major: 6 minor: 1 memoryClockRate(GHz): 1.392
pciBusID: 0000:01:00.0
totalMemory: 3.94GiB freeMemory: 1.73GiB
2018-01-10 18:08:13.552787: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1120] Creating TensorFlow device (/device:GPU:0) -> (device: 0, name: GeForce GTX 1050 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1)
['/device:CPU:0', '/device:GPU:0']

Follow instructions in the following link: https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/

To use Keras models with scikit-learn, we must use the KerasClassifier wrapper. This class takes a function that creates and returns our neural network model. It also takes arguments that it will pass along to the call to fit() such as the number of epochs and the batch size.

2.12.1 Baseline Model

Strangly: https://github.com/keras-team/keras/wiki/Keras-2.0-release-notes Losses & metrics: The objectives module has been renamed losses. Several legacy metric functions have been removed, namely matthews_correlation, precision, recall, fbeta_score, fmeasure. Custom metric functions can no longer return a dict, they must return a single tensor. https://medium.com/@thongonary/how-to-compute-f1-score-for-each-epoch-in-keras-a1acd17715a2

In [382]:
# Create a f0.5 metric function from keras following the code:
# https://www.kaggle.com/arsenyinfo/f-beta-score-for-keras
def fbeta(y_true, y_pred):
    beta = 0.5 # Set the beta to 0.5 to calculate the f0.5 score

    # just in case of hipster activation at the final layer
    y_pred = K.clip(y_pred, 0, 1)

    # shifting the prediction threshold from .5 if needed
    y_pred_bin = K.round(y_pred)

    tp = K.sum(K.round(y_true * y_pred_bin)) + K.epsilon()
    fp = K.sum(K.round(K.clip(y_pred_bin - y_true, 0, 1)))
    fn = K.sum(K.round(K.clip(y_true - y_pred, 0, 1)))

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    beta_squared = beta ** 2
    return (beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall + K.epsilon())
In [383]:
def get_best_model(model, X_train, y_train, X_cross, y_cross, cp_filepath, class_weight=None, 
                   epochs=100, batch_size=1000, cp_verbose=0, fit_verbose=0):
    '''The monitor is changed from the default 'val_loss' to 'fbeta' to monitor the F0.5 score change.
    The mode must be set to 'max' as the fbeta score increases as the performance increases. 
    https://github.com/keras-team/keras/pull/188'''
    
    checkpointer = ModelCheckpoint(filepath=cp_filepath, monitor='val_fbeta', verbose=cp_verbose, 
                                   save_best_only=True, mode='max')
    
    history = model.fit(X_train, y_train, validation_data=(X_cross, y_cross),                        
                        epochs=epochs, batch_size=batch_size, callbacks=[checkpointer], 
                        verbose=fit_verbose, class_weight=class_weight, shuffle=True)
    
    model.load_weights(cp_filepath)
    
    return history
In [384]:
def deep_cv(model, X_train, y_train, X_test, y_test, cp_filepath, class_weight=None, epochs=150, 
            batch_size=1000, cp_verbose=0, fit_verbose=0):
    '''The default batch size is set to 10000, which should be good for a small neural network.'''
    cvscores = []
    iter_i = 0
    for train, test in cv_sets_cls.split(X_train, y_train):
        
        get_best_model(model, X_train.iloc[train], y_train.iloc[train], X_train.iloc[test], y_train.iloc[test], 
                       cp_filepath=cp_filepath, class_weight=class_weight, epochs=epochs, batch_size=batch_size, 
                       cp_verbose=cp_verbose, fit_verbose=fit_verbose)
        
        scores = model.evaluate(X_train.iloc[test], y_train.iloc[test], verbose=fit_verbose) # evaluate the model
        print("{}: {:.4f}".format(model.metrics_names[1], scores[1]))
        cvscores.append(scores[1])
        
        # https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model
        if cvscores[iter_i] - cvscores[iter_i - 1] > 0:
            model.save_weights(cp_filepath)
    
    model.load_weights(cp_filepath)
    test_beta_score = model.evaluate(X_test, y_test, batch_size=batch_size)[1]
    
    print('\n')
    print('The average F0.5 score of the cross validataion is {:.4f} (+/- {:.4f}).'.format(np.mean(cvscores), np.std(cvscores)))  
    print('The F0.5 score for the testing set is {:.4f}.'.format(test_beta_score))
    print('\n')
In [385]:
def plot_learning_history(model, X_train, y_train, cp_filepath, class_weight=None, epochs=100, 
                          batch_size=1000, cp_verbose=0, fit_verbose=0):
    
    cv_generator = cv_sets_cls.split(X_train, y_train)
    train_idx, cross_idx = next(cv_generator)
    
    history = get_best_model(model, X_train.iloc[train_idx], y_train.iloc[train_idx], 
                                    X_train.iloc[cross_idx], y_train.iloc[cross_idx], 
                             cp_filepath, class_weight=class_weight, epochs=epochs, 
                             batch_size=batch_size, cp_verbose=cp_verbose, fit_verbose=fit_verbose)
    
    model.load_weights(cp_filepath)
    
    # https://machinelearningmastery.com/custom-metrics-deep-learning-keras-python/
    plt.figure(figsize=(10, 5))
    plt.plot(history.history['val_fbeta'])
    plt.xlabel('Epoch')
    plt.ylabel('F0.5 Score')
    plt.title('Learning History')
    plt.grid()
    plt.show()
    
    print('The best F0.5 score is {:.4f}.'.format(max(history.history['val_fbeta'])))
Train on the downsampled dataset with the baseline model
In [386]:
# Set a repeatable initializer. The 'TruncatedNormal' is the recommended initializer by Keras.
kernal_init = K_init.TruncatedNormal(mean=0.0, stddev=0.05, seed=random_state)
In [387]:
# baseline model from: https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/
model_dim = X_train_cls.shape[1]
def create_base(dim=model_dim):
    model = Sequential()
    model.add(Dense(dim, input_dim=dim, kernel_initializer=kernal_init, activation='relu'))
    model.add(Dense(1, kernel_initializer=kernal_init, activation='sigmoid'))
    
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[fbeta])
    # model.summary()
    
    return model

From Keras document:

class_weight: Optional dictionary mapping class indices (integers) to a weight (float) value, used for weighting the loss function (during training only). This can be useful to tell the model to "pay more attention" to samples from an under-represented class.

In [388]:
%%time
# Test the simplest model trained by the downsampled dataset on the testing set.
base1 = create_base()

batch_size = max(len(y_train_down), len(y_test_cls))
cp_filepath = 'saved_models/weights.best.base_model_down_noWeight.hdf5'

deep_cv(base1, X_train_down, y_train_down, X_test_cls, y_test_cls, cp_filepath, 
        class_weight=None, batch_size=batch_size, epochs=50)
fbeta: 0.5191
fbeta: 0.5113
fbeta: 0.5187
fbeta: 0.5177
fbeta: 0.5467
fbeta: 0.5514
fbeta: 0.5968
fbeta: 0.6225
fbeta: 0.5558
fbeta: 0.5181
133756/133756 [==============================] - 0s 2us/step


The average F0.5 score of the cross validataion is 0.5458 (+/- 0.0358).
The F0.5 score for the testing set is 0.4661.


CPU times: user 27.1 s, sys: 1.04 s, total: 28.2 s
Wall time: 27.8 s
In [389]:
# Figure out the best class_weight
pos_train_down = y_train_down.sum()
neg_train_down = len(y_train_down) -  pos_train_down
print('The ratio between the negative and positive labels is {:.1f}'.format(neg_train_down / pos_train_down))

pos_weight_down = neg_train_down / pos_train_down / 2
class_weight_down = {0:1, 1:pos_weight_down}
The ratio between the negative and positive labels is 1.2
In [390]:
%%time
# Still trained with the downsampled. The sample weight added.
base2 = create_base()

cp_filepath = 'saved_models/weights.best.base_model_down_withWeight.hdf5'

deep_cv(base2, X_train_down, y_train_down, X_test_cls, y_test_cls, cp_filepath, 
        class_weight=class_weight_down, batch_size=batch_size, epochs=50)
fbeta: 0.4221
fbeta: 0.5271
fbeta: 0.4822
fbeta: 0.4605
fbeta: 0.4253
fbeta: 0.5032
fbeta: 0.5483
fbeta: 0.4747
fbeta: 0.4697
fbeta: 0.5189
133756/133756 [==============================] - 0s 2us/step


The average F0.5 score of the cross validataion is 0.4832 (+/- 0.0396).
The F0.5 score for the testing set is 0.5253.


CPU times: user 25.7 s, sys: 672 ms, total: 26.4 s
Wall time: 25.2 s

50 epochs is good enough. I have tried 1000 epochs, which doesn't give any improvement. As we can see, the performances on both the training set and the testing set are very bad. This is bad even for a baseline model.

I check the performance of the model with and without the custom class weights. It turns out that the one with the custom class weights perform worse. This could be a random thing, so I will explore this parameter further with the full dataset. The weights with the better model will be used as the initial weights for the full dataset.

Train on the full dataset with the baseline model
In [391]:
%%time
base3 = create_base()
base3.load_weights('saved_models/weights.best.base_model_down_noWeight.hdf5')

batch_size = max(len(y_train_cls), len(y_test_cls))
cp_filepath = 'saved_models/weights.best.base_model_full_noWeight.hdf5'

deep_cv(base3, X_train_cls, y_train_cls, X_test_cls, y_test_cls, cp_filepath, 
        class_weight=None, epochs=10, batch_size=batch_size)
fbeta: 0.7399
fbeta: 0.7328
fbeta: 0.7290
fbeta: 0.7230
fbeta: 0.6936
fbeta: 0.7000
fbeta: 0.6869
fbeta: 0.6853
fbeta: 0.6652
fbeta: 0.6560
133756/133756 [==============================] - 0s 2us/step


The average F0.5 score of the cross validataion is 0.7012 (+/- 0.0276).
The F0.5 score for the testing set is 0.7579.


CPU times: user 5min 12s, sys: 22.3 s, total: 5min 34s
Wall time: 5min 22s
In [392]:
pos_train_full = y_train_cls.sum()
neg_train_full = len(y_train_cls) -  pos_train_full
print('The ratio between the negative and positive labels is {:.1f}'.format(neg_train_full / pos_train_full))

pos_weight_full = neg_train_full / pos_train_full / 2
class_weight_full = {0:1, 1:pos_weight_full}
The ratio between the negative and positive labels is 5.5
In [393]:
%%time
base4 = create_base()
base4.load_weights('saved_models/weights.best.base_model_down_noWeight.hdf5')

cp_filepath = 'saved_models/weights.best.base_model_full_withWeight.hdf5'

deep_cv(base4, X_train_cls, y_train_cls, X_test_cls, y_test_cls, cp_filepath, 
        class_weight=class_weight_full, epochs=10, batch_size=batch_size)
fbeta: 0.7358
fbeta: 0.7368
fbeta: 0.7389
fbeta: 0.7474
fbeta: 0.7378
fbeta: 0.7499
fbeta: 0.7432
fbeta: 0.7429
fbeta: 0.7492
fbeta: 0.7380
133756/133756 [==============================] - 0s 2us/step


The average F0.5 score of the cross validataion is 0.7420 (+/- 0.0050).
The F0.5 score for the testing set is 0.7804.


CPU times: user 5min 13s, sys: 22.5 s, total: 5min 36s
Wall time: 5min 23s

It turns out that only 10 epochs of the training on the full training set will give fairly good result! Deep learning doesn't like the downsampled training set. This is the nature of deep learning: more information will lead to better performance.

Note that different validation splits give similar scores. The testing score is also similar. This is a very good sign. It suggests that stratified K-fold well separates different subsets so that each one is representative of the data, including the testing set. As deep neural network is very time-consuming, typically cross validation is not affordable. This discovery gives me more confidence in training a good model based on the training set and a single cross validation set.

Check if more epochs will improve the results.

In [394]:
%%time
base5 = create_base()
base5.load_weights('saved_models/weights.best.base_model_full_withWeight.hdf5') # Picking up from the model with weights.

epochs = 10 # Check 10 more epochs on top of the original 10 epochs.
batch_size = len(y_train_cls)
cp_filepath = 'saved_models/weights.best.base_model_full_withWeight_moreEpochs.hdf5'

plot_learning_history(base5, X_train_cls, y_train_cls, cp_filepath=cp_filepath, epochs=epochs, 
                      class_weight=class_weight_full, batch_size=batch_size, cp_verbose=1, fit_verbose=0)
Epoch 00001: val_fbeta improved from -inf to 0.77243, saving model to saved_models/weights.best.base_model_full_withWeight_moreEpochs.hdf5
Epoch 00002: val_fbeta did not improve
Epoch 00003: val_fbeta did not improve
Epoch 00004: val_fbeta did not improve
Epoch 00005: val_fbeta did not improve
Epoch 00006: val_fbeta did not improve
Epoch 00007: val_fbeta did not improve
Epoch 00008: val_fbeta did not improve
Epoch 00009: val_fbeta did not improve
Epoch 00010: val_fbeta did not improve
The best F0.5 score is 0.7724.
CPU times: user 28.3 s, sys: 2.02 s, total: 30.3 s
Wall time: 30 s
In [395]:
# Evaluate the testing set.
# base5.load_weights(cp_filepath)
base5.evaluate(X_test_cls, y_test_cls, batch_size=batch_size)[1]
133756/133756 [==============================] - 0s 2us/step
Out[395]:
0.77144104242324829

There is an improvement with additional 5 epochs. The result is still not better than the best classifier so far.

Improved Model
Tried:
  1. Optimizers. Learning rate and the decay of the optimizer.
  2. Batch sizes.
  3. Dense and dropout Layers.
  4. Kernal initializer.
  5. Direct work on the full dataset vs first on the downsampled.
  6. Activation. Relu is the best, better than Sigmoid.
  7. Epochs. Now it can reach 0.84 in 200 epochs.
  8. Class weights.

DON'T USE CLASS WEIGHTS FOR TRAINING THE FULL DATASET!!

In [396]:
# Set a higher learning rate for Adam (default 0.001). The deeper the network, the smaller the maximum learning rate is.
opt_Adam = Adam(lr=0.03, decay=0.0)

# Improved model from: https://machinelearningmastery.com/binary-classification-tutorial-with-the-keras-deep-learning-library/
model_dim = X_train_cls.shape[1]

best_model = Sequential()
best_model.add(Dense(model_dim * 1, input_dim=model_dim, kernel_initializer=kernal_init, activation='relu'))
best_model.add(Dense(model_dim // 2, kernel_initializer=kernal_init, activation='relu'))
best_model.add(Dense(model_dim // 2, kernel_initializer=kernal_init, activation='relu'))
best_model.add(Dense(model_dim // 3, kernel_initializer=kernal_init, activation='relu'))
best_model.add(Dense(1, kernel_initializer=kernal_init, activation='sigmoid'))

best_model.compile(loss='binary_crossentropy', optimizer=opt_Adam, metrics=[fbeta])
best_model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_11 (Dense)             (None, 30)                930       
_________________________________________________________________
dense_12 (Dense)             (None, 15)                465       
_________________________________________________________________
dense_13 (Dense)             (None, 15)                240       
_________________________________________________________________
dense_14 (Dense)             (None, 10)                160       
_________________________________________________________________
dense_15 (Dense)             (None, 1)                 11        
=================================================================
Total params: 1,806
Trainable params: 1,806
Non-trainable params: 0
_________________________________________________________________
In [397]:
%%time
# Improve the parameters on the full dataset with the pre-trained model.
# This is just a demo. The best model has been saved.
epochs = 200

# The optimum batch size is about the length of the training set divided by 500, which equals to 1070.
# The class weight is set to None.
batch_size = max(len(y_train_cls), len(y_test_cls)) // 1000
cp_filepath = 'saved_models/weights.best.best_model.hdf5'
plot_learning_history(best_model, X_train_cls, y_train_cls, cp_filepath=cp_filepath, 
                      class_weight=None, epochs=epochs, batch_size=batch_size, cp_verbose=True)
Epoch 00001: val_fbeta improved from -inf to 0.80757, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00002: val_fbeta did not improve
Epoch 00003: val_fbeta improved from 0.80757 to 0.81332, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00004: val_fbeta did not improve
Epoch 00005: val_fbeta did not improve
Epoch 00006: val_fbeta did not improve
Epoch 00007: val_fbeta did not improve
Epoch 00008: val_fbeta improved from 0.81332 to 0.82159, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00009: val_fbeta did not improve
Epoch 00010: val_fbeta did not improve
Epoch 00011: val_fbeta did not improve
Epoch 00012: val_fbeta did not improve
Epoch 00013: val_fbeta did not improve
Epoch 00014: val_fbeta did not improve
Epoch 00015: val_fbeta did not improve
Epoch 00016: val_fbeta did not improve
Epoch 00017: val_fbeta did not improve
Epoch 00018: val_fbeta did not improve
Epoch 00019: val_fbeta improved from 0.82159 to 0.82408, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00020: val_fbeta did not improve
Epoch 00021: val_fbeta did not improve
Epoch 00022: val_fbeta did not improve
Epoch 00023: val_fbeta did not improve
Epoch 00024: val_fbeta did not improve
Epoch 00025: val_fbeta did not improve
Epoch 00026: val_fbeta did not improve
Epoch 00027: val_fbeta did not improve
Epoch 00028: val_fbeta did not improve
Epoch 00029: val_fbeta did not improve
Epoch 00030: val_fbeta did not improve
Epoch 00031: val_fbeta did not improve
Epoch 00032: val_fbeta did not improve
Epoch 00033: val_fbeta did not improve
Epoch 00034: val_fbeta did not improve
Epoch 00035: val_fbeta did not improve
Epoch 00036: val_fbeta did not improve
Epoch 00037: val_fbeta did not improve
Epoch 00038: val_fbeta did not improve
Epoch 00039: val_fbeta did not improve
Epoch 00040: val_fbeta did not improve
Epoch 00041: val_fbeta did not improve
Epoch 00042: val_fbeta did not improve
Epoch 00043: val_fbeta did not improve
Epoch 00044: val_fbeta did not improve
Epoch 00045: val_fbeta did not improve
Epoch 00046: val_fbeta improved from 0.82408 to 0.82435, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00047: val_fbeta did not improve
Epoch 00048: val_fbeta did not improve
Epoch 00049: val_fbeta did not improve
Epoch 00050: val_fbeta did not improve
Epoch 00051: val_fbeta did not improve
Epoch 00052: val_fbeta did not improve
Epoch 00053: val_fbeta did not improve
Epoch 00054: val_fbeta did not improve
Epoch 00055: val_fbeta did not improve
Epoch 00056: val_fbeta improved from 0.82435 to 0.82483, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00057: val_fbeta did not improve
Epoch 00058: val_fbeta did not improve
Epoch 00059: val_fbeta did not improve
Epoch 00060: val_fbeta did not improve
Epoch 00061: val_fbeta did not improve
Epoch 00062: val_fbeta did not improve
Epoch 00063: val_fbeta did not improve
Epoch 00064: val_fbeta did not improve
Epoch 00065: val_fbeta did not improve
Epoch 00066: val_fbeta did not improve
Epoch 00067: val_fbeta did not improve
Epoch 00068: val_fbeta did not improve
Epoch 00069: val_fbeta did not improve
Epoch 00070: val_fbeta did not improve
Epoch 00071: val_fbeta did not improve
Epoch 00072: val_fbeta did not improve
Epoch 00073: val_fbeta did not improve
Epoch 00074: val_fbeta did not improve
Epoch 00075: val_fbeta did not improve
Epoch 00076: val_fbeta did not improve
Epoch 00077: val_fbeta did not improve
Epoch 00078: val_fbeta did not improve
Epoch 00079: val_fbeta did not improve
Epoch 00080: val_fbeta did not improve
Epoch 00081: val_fbeta did not improve
Epoch 00082: val_fbeta did not improve
Epoch 00083: val_fbeta did not improve
Epoch 00084: val_fbeta did not improve
Epoch 00085: val_fbeta did not improve
Epoch 00086: val_fbeta did not improve
Epoch 00087: val_fbeta did not improve
Epoch 00088: val_fbeta did not improve
Epoch 00089: val_fbeta did not improve
Epoch 00090: val_fbeta did not improve
Epoch 00091: val_fbeta did not improve
Epoch 00092: val_fbeta did not improve
Epoch 00093: val_fbeta did not improve
Epoch 00094: val_fbeta did not improve
Epoch 00095: val_fbeta did not improve
Epoch 00096: val_fbeta did not improve
Epoch 00097: val_fbeta did not improve
Epoch 00098: val_fbeta did not improve
Epoch 00099: val_fbeta did not improve
Epoch 00100: val_fbeta did not improve
Epoch 00101: val_fbeta did not improve
Epoch 00102: val_fbeta did not improve
Epoch 00103: val_fbeta did not improve
Epoch 00104: val_fbeta did not improve
Epoch 00105: val_fbeta did not improve
Epoch 00106: val_fbeta did not improve
Epoch 00107: val_fbeta improved from 0.82483 to 0.82506, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00108: val_fbeta did not improve
Epoch 00109: val_fbeta did not improve
Epoch 00110: val_fbeta did not improve
Epoch 00111: val_fbeta did not improve
Epoch 00112: val_fbeta did not improve
Epoch 00113: val_fbeta did not improve
Epoch 00114: val_fbeta did not improve
Epoch 00115: val_fbeta did not improve
Epoch 00116: val_fbeta did not improve
Epoch 00117: val_fbeta did not improve
Epoch 00118: val_fbeta did not improve
Epoch 00119: val_fbeta did not improve
Epoch 00120: val_fbeta did not improve
Epoch 00121: val_fbeta did not improve
Epoch 00122: val_fbeta did not improve
Epoch 00123: val_fbeta did not improve
Epoch 00124: val_fbeta did not improve
Epoch 00125: val_fbeta did not improve
Epoch 00126: val_fbeta improved from 0.82506 to 0.82721, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00127: val_fbeta improved from 0.82721 to 0.82779, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00128: val_fbeta did not improve
Epoch 00129: val_fbeta did not improve
Epoch 00130: val_fbeta did not improve
Epoch 00131: val_fbeta did not improve
Epoch 00132: val_fbeta did not improve
Epoch 00133: val_fbeta improved from 0.82779 to 0.83147, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00134: val_fbeta did not improve
Epoch 00135: val_fbeta did not improve
Epoch 00136: val_fbeta did not improve
Epoch 00137: val_fbeta did not improve
Epoch 00138: val_fbeta did not improve
Epoch 00139: val_fbeta did not improve
Epoch 00140: val_fbeta did not improve
Epoch 00141: val_fbeta did not improve
Epoch 00142: val_fbeta did not improve
Epoch 00143: val_fbeta did not improve
Epoch 00144: val_fbeta did not improve
Epoch 00145: val_fbeta did not improve
Epoch 00146: val_fbeta did not improve
Epoch 00147: val_fbeta did not improve
Epoch 00148: val_fbeta did not improve
Epoch 00149: val_fbeta did not improve
Epoch 00150: val_fbeta did not improve
Epoch 00151: val_fbeta did not improve
Epoch 00152: val_fbeta did not improve
Epoch 00153: val_fbeta did not improve
Epoch 00154: val_fbeta did not improve
Epoch 00155: val_fbeta did not improve
Epoch 00156: val_fbeta did not improve
Epoch 00157: val_fbeta did not improve
Epoch 00158: val_fbeta did not improve
Epoch 00159: val_fbeta did not improve
Epoch 00160: val_fbeta did not improve
Epoch 00161: val_fbeta did not improve
Epoch 00162: val_fbeta did not improve
Epoch 00163: val_fbeta did not improve
Epoch 00164: val_fbeta improved from 0.83147 to 0.83315, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00165: val_fbeta did not improve
Epoch 00166: val_fbeta did not improve
Epoch 00167: val_fbeta did not improve
Epoch 00168: val_fbeta did not improve
Epoch 00169: val_fbeta did not improve
Epoch 00170: val_fbeta did not improve
Epoch 00171: val_fbeta did not improve
Epoch 00172: val_fbeta did not improve
Epoch 00173: val_fbeta did not improve
Epoch 00174: val_fbeta did not improve
Epoch 00175: val_fbeta did not improve
Epoch 00176: val_fbeta did not improve
Epoch 00177: val_fbeta did not improve
Epoch 00178: val_fbeta did not improve
Epoch 00179: val_fbeta did not improve
Epoch 00180: val_fbeta did not improve
Epoch 00181: val_fbeta did not improve
Epoch 00182: val_fbeta did not improve
Epoch 00183: val_fbeta did not improve
Epoch 00184: val_fbeta did not improve
Epoch 00185: val_fbeta did not improve
Epoch 00186: val_fbeta did not improve
Epoch 00187: val_fbeta did not improve
Epoch 00188: val_fbeta did not improve
Epoch 00189: val_fbeta did not improve
Epoch 00190: val_fbeta did not improve
Epoch 00191: val_fbeta did not improve
Epoch 00192: val_fbeta did not improve
Epoch 00193: val_fbeta did not improve
Epoch 00194: val_fbeta did not improve
Epoch 00195: val_fbeta did not improve
Epoch 00196: val_fbeta did not improve
Epoch 00197: val_fbeta improved from 0.83315 to 0.83344, saving model to saved_models/weights.best.best_model.hdf5
Epoch 00198: val_fbeta did not improve
Epoch 00199: val_fbeta did not improve
Epoch 00200: val_fbeta did not improve
The best F0.5 score is 0.8334.
CPU times: user 36min 43s, sys: 1min 10s, total: 37min 53s
Wall time: 32min 22s

2.12.3 Evaluate the best model on the testing set

Load the weights from the best model.

In [398]:
best_model.load_weights('saved_models/weights.best.model_8.hdf5')
In [399]:
max_batch_size = len(y_train_cls)

Note the difference of the result when the batch size of the evaluation is set differently.

In [400]:
# This is the evaluation with the default batch size.
best_model.evaluate(X_test_cls, y_test_cls)[1]
133756/133756 [==============================] - 15s 112us/step
Out[400]:
0.80139359109102304
In [401]:
# This is the evaluation with the maximum batch size.
best_model.evaluate(X_test_cls, y_test_cls, max_batch_size)[1]
133756/133756 [==============================] - 0s 2us/step
Out[401]:
0.84171968698501587
In [402]:
%%time
# Predict the y values with the default batch size and the maximum batch size.
# Note that the result needs to be rounded in order to compare with y_test_cls as the raw output is float from sigmoid function.
y_pred_test_MLP_default = best_model.predict(X_test_cls).round()
y_pred_test_MLP_max = best_model.predict(X_test_cls, batch_size=max_batch_size).round()
CPU times: user 7.94 s, sys: 520 ms, total: 8.46 s
Wall time: 7.3 s
In [403]:
fbeta_score(y_test_cls, y_pred_test_MLP_default, beta=0.5)
Out[403]:
0.84171985486903789
In [404]:
fbeta_score(y_test_cls, y_pred_test_MLP_max, beta=0.5)
Out[404]:
0.84171985486903789

As seen above, if the predict function is used instead of the evaluation function, both results match the evaluation result with the maximum batch size. This must be a Keras bug. One should be careful in using the evaluation function.

In [405]:
# Evaluation on the X_train_cls
best_model.evaluate(X_train_cls, y_train_cls, batch_size=max_batch_size)[1]
535022/535022 [==============================] - 1s 2us/step
Out[405]:
0.84049493074417114
In [406]:
# Evaluation on the X_train_down
best_model.evaluate(X_train_down, y_train_down, batch_size=max_batch_size)[1]
10354/10354 [==============================] - 0s 3us/step
Out[406]:
0.75628030300140381

The model fits the whole dataset very well. However, the model doesn't work very well with the downsampled dataset.

In [407]:
%%time
MLP_importance = {}
for col in X_test_cls.columns:
    X_temp = X_test_cls.copy()
    X_temp[col] = 0
    X_temp_score = best_model.evaluate(X_temp, y_test_cls, batch_size=max_batch_size, verbose=0)[1]
    MLP_importance[col] = X_temp_score
CPU times: user 11.4 s, sys: 1.77 s, total: 13.2 s
Wall time: 13.5 s
In [408]:
MLP_importance_series = pd.Series(MLP_importance).sort_values()
In [409]:
MLP_importance_series.plot.bar(figsize=(15, 5), fontsize=14)
plt.ylabel('F0.5 score')
plt.title('Feature Impact on the Score')
plt.show()
In [410]:
MLP_importance_series[-2:]
Out[410]:
Fog_Rain    0.841786
Thunder     0.841829
dtype: float64
In [411]:
# Calculate the importance gain from the least important feature importance.
MLP_impact = -(MLP_importance_series - MLP_importance_series[-1])
In [412]:
# Scale the data to be comparable with other algorithms.
MLP_impact_norm = pd.Series(minmax_scale(MLP_impact))
MLP_impact_norm.index = MLP_impact.index
In [413]:
cls_importance_all = pd.concat([cls_importance[['Ridge', 'LogGain']], MLP_impact_norm], axis = 1)
cls_importance_all.rename(columns = {0: 'MLP', 'LogGain': 'XGB'}, inplace=True)
In [414]:
cls_importance_all.sort_values('Ridge', ascending=False).plot.bar(grid=True, figsize=(20, 5), fontsize=14)
plt.show()

The smaller the score, the more important the feature.

In [415]:
%%time
MLP_importance_series_2 = MLP_importance_series.copy()
MLP_index = MLP_importance_series.index

for col in MLP_index:
    X_temp = X_test_cls.copy()
    
    col_idx = MLP_index.get_loc(col) # Get the index of a specific column from the MLP_importance_series
    col_del = MLP_index[(col_idx+1):] # Set all columns after the specified column to value zero
    X_temp.loc[:, col_del] = 0
    
    X_temp_score = best_model.evaluate(X_temp, y_test_cls, batch_size=max_batch_size, verbose=0)[1]
    MLP_importance_series_2[col] = X_temp_score
CPU times: user 11.1 s, sys: 1.08 s, total: 12.2 s
Wall time: 12.3 s
In [416]:
MLP_importance_series_2.plot.bar(figsize=(15, 5), fontsize=14)
plt.ylabel('F0.5 score')
plt.title('Feature Impact on the Score')
plt.show()

The above figure is plotted with only features more important than the specific feature. Columns with less important features are all set to zero.

In [417]:
# These features are not useful for the MLP model.
MLP_importance_series_2[MLP_importance_series_2 > 0.84]
Out[417]:
Max_Wind       0.840355
Mean_Wind      0.840064
inter_group    0.840451
weekday        0.841687
Rain           0.841869
Wind_Deg       0.841895
Fog_Rain       0.841829
Thunder        0.841720
dtype: float64
In [418]:
# http://forums.fast.ai/t/how-could-i-release-gpu-memory-of-keras/2023/11
K.clear_session()
In [419]:
# Calculate the total time used for running the whole notebook.
notebook_end_time = time()
notebook_running_time = (notebook_end_time - notebook_start_time) / 60
print('Running the entire notebook takes {:.2f} minutes.'.format(notebook_running_time))
Running the entire notebook takes 104.07 minutes.